Merge branch 'develop' into win_tidy
This commit is contained in:
commit
b29fd48998
|
|
@ -14,8 +14,8 @@ jobs:
|
|||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
|
||||
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
|
||||
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
|
||||
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
|
|
@ -76,7 +76,7 @@ jobs:
|
|||
run: |
|
||||
wget ${xuetie_toolchain}/${toolchain_file_name}
|
||||
tar -xvf ${toolchain_file_name} -C /opt
|
||||
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
|
||||
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
|
||||
|
||||
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
|
|
|
|||
|
|
@ -16,13 +16,13 @@ jobs:
|
|||
include:
|
||||
- target: LOONGSONGENERIC
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
|
||||
- target: LOONGSON3R5
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSON3R5
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
|
||||
- target: LOONGSON2K1000
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSON2K1000
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
|
||||
- target: DYNAMIC_ARCH
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
|
||||
|
|
@ -40,8 +40,9 @@ jobs:
|
|||
|
||||
- name: Download and install loongarch64-toolchain
|
||||
run: |
|
||||
wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
|
||||
tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
|
||||
wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
|
||||
#wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
|
||||
tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt
|
||||
|
||||
- name: Set env
|
||||
run: |
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
|
|||
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 25.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 26.dev)
|
||||
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
|
|
@ -256,15 +256,15 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
|
|||
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
|
||||
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
|
||||
"sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
|
||||
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
|
||||
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
|
||||
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
|
||||
else ()
|
||||
set (CMAKE_C_CREATE_SHARED_LIBRARY
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
|
||||
"sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
|
||||
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
|
|
|
|||
|
|
@ -220,4 +220,6 @@ In chronological order:
|
|||
* Mark Seminatore <https://github.com/mseminatore>
|
||||
* [2023-11-09] Improve Windows threading performance scaling
|
||||
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
|
||||
|
||||
|
||||
* Dirreke <https://github.com/Dirreke>
|
||||
* [2024-01-16] Add basic support for the CSKY architecture
|
||||
|
|
|
|||
|
|
@ -1,4 +1,49 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.26
|
||||
2-Jan-2024
|
||||
|
||||
general:
|
||||
- improved the version of openblas.pc that is created by the CMAKE build
|
||||
- fixed a CMAKE-specific build problem on older versions of MacOS
|
||||
- worked around linking problems on old versions of MacOS
|
||||
- corrected installation location of the lapacke_mangling header in CMAKE builds
|
||||
- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
|
||||
- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
|
||||
- imported additions and corrections from the Reference-LAPACK project:
|
||||
- added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
|
||||
- handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
|
||||
- fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
|
||||
- fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
|
||||
- added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
|
||||
- fixed several memory leaks in the LAPACK testsuite (PR 953)
|
||||
- fixed counting of testsuite results by the Python script (PR 954)
|
||||
|
||||
x86-64:
|
||||
- fixed computation of CASUM on SkylakeX and newer targets in the special
|
||||
case that AVX512 is not supported by the compiler or operating environment
|
||||
- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
|
||||
- worked around a problem in the pre-AVX kernels for GEMV
|
||||
- sped up the thread management code on MS Windows
|
||||
|
||||
arm64:
|
||||
- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
|
||||
- sped up the thread management code on MS Windows
|
||||
- sped up SGEMM and DGEMM on Neoverse V1 and N1
|
||||
- sped up ?DOT on SVE-capable targets
|
||||
- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
|
||||
- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
|
||||
|
||||
power:
|
||||
- improved the SGEMM kernel for POWER10
|
||||
- fixed compilation with (very) old versions of gcc
|
||||
- fixed detection of old 32bit PPC targets in CMAKE-based builds
|
||||
- added autodetection of the POWERPC 7400 subtype
|
||||
- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
|
||||
|
||||
loongarch64:
|
||||
- added and improved optimized kernels for almost all BLAS functions
|
||||
|
||||
====================================================================
|
||||
Version 0.3.25
|
||||
12-Nov-2023
|
||||
|
|
|
|||
|
|
@ -104,19 +104,25 @@ ifneq ($(F_COMPILER), NAG)
|
|||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifneq ($(CROSS), 1)
|
||||
CCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||
FCOMMON_OPT += -march=armv8.4-a
|
||||
ifneq ($(CROSS), 1)
|
||||
FCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
|
|
@ -132,25 +138,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
|||
ifneq ($(OSNAME), Darwin)
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve
|
||||
ifneq ($(CROSS), 1)
|
||||
CCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
FCOMMON_OPT += -march=armv8.5-a
|
||||
ifneq ($(CROSS), 1)
|
||||
FCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -0,0 +1,4 @@
|
|||
ifeq ($(CORE), CK860FV)
|
||||
CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||
FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
|
||||
endif
|
||||
|
|
@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V)
|
|||
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), CK860FV)
|
||||
TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), x280)
|
||||
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), RISCV64_ZVL256B)
|
||||
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), RISCV64_ZVL128B)
|
||||
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), RISCV64_GENERIC)
|
||||
TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
|
||||
endif
|
||||
|
||||
all: getarch_2nd
|
||||
./getarch_2nd 0 >> $(TARGET_MAKE)
|
||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||
|
|
|
|||
|
|
@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
|
|||
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
|
||||
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
|
||||
endif
|
||||
ifeq ($(CORE), x280)
|
||||
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
|
||||
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_ZVL256B)
|
||||
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_ZVL128B)
|
||||
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_GENERIC)
|
||||
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.25.dev
|
||||
VERSION = 0.3.26.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
|||
|
|
@ -677,16 +677,12 @@ ifeq ($(ARCH), arm64)
|
|||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA53
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += NEOVERSEN1
|
||||
ifneq ($(NO_SVE), 1)
|
||||
DYNAMIC_CORE += NEOVERSEV1
|
||||
DYNAMIC_CORE += NEOVERSEN2
|
||||
DYNAMIC_CORE += ARMV8SVE
|
||||
endif
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
|
|
@ -877,6 +873,11 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), csky)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
#
|
||||
# C Compiler dependent settings
|
||||
#
|
||||
|
|
|
|||
|
|
@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC)
|
|||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# cooperlake support was added in clang 12
|
||||
# sapphire rapids support was added in clang 12
|
||||
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
|
|
|
|||
13
README.md
13
README.md
|
|
@ -196,7 +196,12 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
```sh
|
||||
make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
(also known to work on C906)
|
||||
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
|
||||
|
||||
- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
|
|
@ -207,9 +212,11 @@ For **x86_64**, the list of targets this activates contains Prescott, Core2, Neh
|
|||
`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
|
||||
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
|
||||
|
||||
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
|
||||
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.
|
||||
|
||||
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
|
||||
For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
|
||||
|
||||
On **ZARCH**, it comprises Z13 and Z14 as well as generic zarch support.
|
||||
|
||||
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
|
||||
common code in the library; usually you will want to set this to the oldest model you expect to encounter.
|
||||
|
|
|
|||
|
|
@ -118,8 +118,11 @@ Z13
|
|||
Z14
|
||||
|
||||
10.RISC-V 64:
|
||||
RISCV64_GENERIC
|
||||
RISCV64_GENERIC (e.g. PolarFire SoC/SiFive U54)
|
||||
RISCV64_ZVL128B
|
||||
C910V
|
||||
x280
|
||||
RISCV64_ZVL256B
|
||||
|
||||
11.LOONGARCH64:
|
||||
LOONGSONGENERIC
|
||||
|
|
@ -133,3 +136,7 @@ E2K
|
|||
EV4
|
||||
EV5
|
||||
EV6
|
||||
|
||||
14.CSKY
|
||||
CSKY
|
||||
CK860FV
|
||||
|
|
|
|||
|
|
@ -288,9 +288,9 @@ jobs:
|
|||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
|
||||
&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
|
||||
&& echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06 alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64
|
||||
|
|
|
|||
|
|
@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
|
|||
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
|
||||
# x280 temporary workaround for gfortran
|
||||
ifeq ($(TARGET), x280)
|
||||
CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
|
||||
endif
|
||||
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
|
||||
|
|
@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
|
|||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
|
||||
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
24
c_check
24
c_check
|
|
@ -91,6 +91,7 @@ case "$data" in
|
|||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_RISCV64*) architecture=riscv64 ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
*ARCH_CSKY*) architecture=csky ;;
|
||||
esac
|
||||
|
||||
defined=0
|
||||
|
|
@ -236,6 +237,7 @@ case "$data" in
|
|||
*ARCH_ARM*) architecture=arm ;;
|
||||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
*ARCH_CSKY*) architecture=csky ;;
|
||||
esac
|
||||
|
||||
binformat='bin32'
|
||||
|
|
@ -244,6 +246,7 @@ case "$data" in
|
|||
esac
|
||||
|
||||
no_avx512=0
|
||||
no_avx512bf=0
|
||||
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
|
|
@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
|||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
if [ "$no_avx512" -eq 0 ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
|
||||
printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
|
||||
if [ "$compiler" = "PGI" ]; then
|
||||
args=" -tp cooperlake -c -o $tmpf.o $tmpf"
|
||||
else
|
||||
args=" -march=cooperlake -c -o $tmpf.o $tmpf"
|
||||
fi
|
||||
no_avx512bf=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_avx512bf=1
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
fi
|
||||
|
||||
no_rv64gv=0
|
||||
|
|
@ -409,6 +431,7 @@ done
|
|||
[ "$makefile" = "-" ] && {
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
exit 0
|
||||
|
|
@ -437,6 +460,7 @@ done
|
|||
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
||||
|
|
|
|||
|
|
@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
|||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
|
|
@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
|
|||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "csky") {
|
||||
$defined = 1;
|
||||
$binary = 32;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
|
|
@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
|
|||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
|
|
|||
22
cblas.h
22
cblas.h
|
|
@ -12,6 +12,7 @@ extern "C" {
|
|||
/*Set the number of threads on runtime.*/
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
void goto_set_num_threads(int num_threads);
|
||||
int openblas_set_num_threads_local(int num_threads);
|
||||
|
||||
/*Get the number of threads on runtime.*/
|
||||
int openblas_get_num_threads(void);
|
||||
|
|
@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
|||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
|
@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
|
|||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
|
@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
|
|||
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
|
||||
void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||
void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
|
||||
void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
|
||||
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||
|
|
|
|||
|
|
@ -44,7 +44,7 @@ endif ()
|
|||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
|
||||
endif ()
|
||||
|
|
|
|||
|
|
@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
|
|||
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
|
||||
CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
|
||||
if(COMPILER_SUPPORT_LP64D_ABI)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
|
||||
else()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
|
||||
endif ()
|
||||
else ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
|
||||
CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
|
||||
if(COMPILER_SUPPORT_ILP32D_ABI)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
|
||||
else()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
|
@ -282,6 +292,27 @@ if (${CORE} STREQUAL POWER8)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
# With -mcpu=970 added it compiles, but the library is broken, at least on macOS. If someone
|
||||
# tests on *BSD or Linux and adds this flag, please make sure it is not used for the macOS case.
|
||||
if (${CORE} STREQUAL PPC970)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
|
||||
endif ()
|
||||
if (APPLE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
|
||||
if (${CORE} STREQUAL PPCG4)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
|
||||
endif ()
|
||||
if (APPLE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (HAVE_AVX2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
|
||||
|
|
|
|||
|
|
@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
|||
endif ()
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
|
||||
CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
|
||||
if(COMPILER_SUPPORT_LP64D_ABI)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
|
||||
else()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
|
||||
endif ()
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
|
||||
if(COMPILER_SUPPORT_ILP32D_ABI)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
|
||||
else()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
if (RISCV64)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
|||
openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OPENBLAS_VERSION@
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Version: @OpenBLAS_VERSION@
|
||||
URL: https://github.com/OpenMathLib/OpenBLAS
|
||||
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
|
||||
Cflags: -I${includedir}
|
||||
|
|
|
|||
|
|
@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64)
|
|||
endif()
|
||||
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
|
||||
set(POWER 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
|
|
@ -109,7 +109,7 @@ else()
|
|||
endif ()
|
||||
|
||||
if (NOT BINARY)
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
|
||||
if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
|
||||
set(BINARY 64)
|
||||
else ()
|
||||
set(BINARY 32)
|
||||
|
|
|
|||
6
common.h
6
common.h
|
|
@ -396,7 +396,7 @@ typedef int blasint;
|
|||
#endif
|
||||
|
||||
/***
|
||||
To alloc job_t on heap or statck.
|
||||
To alloc job_t on heap or stack.
|
||||
Please see https://github.com/xianyi/OpenBLAS/issues/246
|
||||
***/
|
||||
#if defined(OS_WINDOWS)
|
||||
|
|
@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_e2k.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_CSKY
|
||||
#include "common_csky.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
|
|
|
|||
|
|
@ -0,0 +1,56 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_CSKY
|
||||
#define COMMON_CSKY
|
||||
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#endif
|
||||
|
|
@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
|
|||
void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
|
||||
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
|
||||
|
||||
void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
|
||||
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
|
||||
double *, blasint *, double *, blasint *, double *, double *, blasint *);
|
||||
void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
|
||||
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
|
||||
double *, blasint *, double *, blasint *, double *, double *, blasint *);
|
||||
|
||||
int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
|
||||
float *, float *, blasint *, float *, blasint *,
|
||||
float *, float *, blasint *);
|
||||
|
|
@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
|
|||
|
||||
void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
||||
void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
||||
void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
|
||||
void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
|
||||
|
||||
void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
|
||||
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
|
||||
|
|
|
|||
|
|
@ -119,19 +119,47 @@ static inline int WhereAmI(void){
|
|||
#define MOV fmov.d
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.d
|
||||
#define MTG movfr2gr.d
|
||||
#define FABS fabs.d
|
||||
#define FMIN fmin.d
|
||||
#define FMINA fmina.d
|
||||
#define FMAX fmax.d
|
||||
#define FMAXA fmaxa.d
|
||||
#define CMPEQ fcmp.ceq.d
|
||||
#define CMPLE fcmp.cle.d
|
||||
#define CMPLT fcmp.clt.d
|
||||
#define NEG fneg.d
|
||||
#define FFINT ffint.d.l
|
||||
|
||||
#define XVFSUB xvfsub.d
|
||||
#define XVFADD xvfadd.d
|
||||
#define XVFMUL xvfmul.d
|
||||
#define XVFMADD xvfmadd.d
|
||||
#define XVFMIN xvfmin.d
|
||||
#define XVFMINA xvfmina.d
|
||||
#define XVFMAX xvfmax.d
|
||||
#define XVFMAXA xvfmaxa.d
|
||||
#define XVCMPEQ xvfcmp.ceq.d
|
||||
#define XVCMPLE xvfcmp.cle.d
|
||||
#define XVCMPLT xvfcmp.clt.d
|
||||
#define XVMUL xvfmul.d
|
||||
#define XVMSUB xvfmsub.d
|
||||
#define XVNMSUB xvfnmsub.d
|
||||
|
||||
#define VFSUB vfsub.d
|
||||
#define VFADD vfadd.d
|
||||
#define VFMUL vfmul.d
|
||||
#define VFMADD vfmadd.d
|
||||
#define VFMIN vfmin.d
|
||||
#define VFMINA vfmina.d
|
||||
#define VFMAX vfmax.d
|
||||
#define VFMAXA vfmaxa.d
|
||||
#define VCMPEQ vfcmp.ceq.d
|
||||
#define VCMPLE vfcmp.cle.d
|
||||
#define VCMPLT vfcmp.clt.d
|
||||
#define VMUL vfmul.d
|
||||
#define VMSUB vfmsub.d
|
||||
#define VNMSUB vfnmsub.d
|
||||
|
||||
#else
|
||||
|
||||
|
|
@ -147,19 +175,47 @@ static inline int WhereAmI(void){
|
|||
#define MOV fmov.s
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.w
|
||||
#define MTG movfr2gr.s
|
||||
#define FABS fabs.s
|
||||
#define FMIN fmin.s
|
||||
#define FMINA fmina.s
|
||||
#define FMAX fmax.s
|
||||
#define FMAXA fmaxa.s
|
||||
#define CMPEQ fcmp.ceq.s
|
||||
#define CMPLE fcmp.cle.s
|
||||
#define CMPLT fcmp.clt.s
|
||||
#define NEG fneg.s
|
||||
#define FFINT ffint.s.l
|
||||
|
||||
#define XVFSUB xvfsub.s
|
||||
#define XVFADD xvfadd.s
|
||||
#define XVFMUL xvfmul.s
|
||||
#define XVFMADD xvfmadd.s
|
||||
#define XVFMIN xvfmin.s
|
||||
#define XVFMINA xvfmina.s
|
||||
#define XVFMAX xvfmax.s
|
||||
#define XVFMAXA xvfmaxa.s
|
||||
#define XVCMPEQ xvfcmp.ceq.s
|
||||
#define XVCMPLE xvfcmp.cle.s
|
||||
#define XVCMPLT xvfcmp.clt.s
|
||||
#define XVMUL xvfmul.s
|
||||
#define XVMSUB xvfmsub.s
|
||||
#define XVNMSUB xvfnmsub.s
|
||||
|
||||
#define VFSUB vfsub.s
|
||||
#define VFADD vfadd.s
|
||||
#define VFMUL vfmul.s
|
||||
#define VFMADD vfmadd.s
|
||||
#define VFMIN vfmin.s
|
||||
#define VFMINA vfmina.s
|
||||
#define VFMAX vfmax.s
|
||||
#define VFMAXA vfmaxa.s
|
||||
#define VCMPEQ vfcmp.ceq.s
|
||||
#define VCMPLE vfcmp.cle.s
|
||||
#define VCMPLT vfcmp.clt.s
|
||||
#define VMUL vfmul.s
|
||||
#define VMSUB vfmsub.s
|
||||
#define VNMSUB vfnmsub.s
|
||||
|
||||
#endif /* defined(DOUBLE) */
|
||||
|
||||
|
|
|
|||
|
|
@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#if defined(C910V)
|
||||
#include <riscv_vector.h>
|
||||
#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
|
||||
# include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
|
||||
// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
|
||||
#define RISCV_0p10_INTRINSICS
|
||||
#define RISCV_RVV(x) x
|
||||
#else
|
||||
#define RISCV_RVV(x) __riscv_ ## x
|
||||
#endif
|
||||
|
||||
#if defined(C910V) || defined(RISCV64_ZVL256B)
|
||||
# if !defined(DOUBLE)
|
||||
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
|
||||
# else
|
||||
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
|
||||
# endif
|
||||
#else
|
||||
# define EXTRACT_FLOAT(v) (v[0])
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -137,19 +137,20 @@ typedef struct blas_queue {
|
|||
|
||||
extern int blas_server_avail;
|
||||
extern int blas_omp_number_max;
|
||||
extern int blas_omp_threads_local;
|
||||
|
||||
static __inline int num_cpu_avail(int level) {
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
int openmp_nthreads;
|
||||
openmp_nthreads=omp_get_max_threads();
|
||||
if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
if (blas_cpu_number == 1
|
||||
#endif
|
||||
#ifdef USE_OPENMP
|
||||
if (openmp_nthreads == 1 || omp_in_parallel()
|
||||
#else
|
||||
if (openmp_nthreads == 1
|
||||
#endif
|
||||
) return 1;
|
||||
|
||||
|
|
|
|||
|
|
@ -160,6 +160,7 @@ int detect(void){
|
|||
infoCount = HOST_BASIC_INFO_COUNT;
|
||||
host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);
|
||||
|
||||
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
|
||||
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
|
||||
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970;
|
||||
|
||||
|
|
|
|||
|
|
@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_C910V 1
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_C910V 1
|
||||
#define CPU_x280 2
|
||||
#define CPU_RISCV64_ZVL256B 3
|
||||
#define CPU_RISCV64_ZVL128B 4
|
||||
|
||||
static char *cpuname[] = {
|
||||
"RISCV64_GENERIC",
|
||||
"C910V"
|
||||
"C910V",
|
||||
"x280",
|
||||
"CPU_RISCV64_ZVL256B",
|
||||
"CPU_RISCV64_ZVL128B"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"riscv64_generic",
|
||||
"c910v",
|
||||
"x280",
|
||||
"riscv64_zvl256b",
|
||||
"riscv64_zvl128b"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
|
@ -86,23 +100,29 @@ int detect(void){
|
|||
char *pmodel = NULL, *pisa = NULL;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
if (!infile)
|
||||
return CPU_GENERIC;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if(!strncmp(buffer, "model name", 10)){
|
||||
strcpy(model_buffer, buffer);
|
||||
pmodel = strchr(isa_buffer, ':') + 1;
|
||||
pmodel = strchr(model_buffer, ':');
|
||||
if (pmodel)
|
||||
pmodel++;
|
||||
}
|
||||
|
||||
if(!strncmp(buffer, "isa", 3)){
|
||||
strcpy(isa_buffer, buffer);
|
||||
pisa = strchr(isa_buffer, '4') + 1;
|
||||
pisa = strchr(isa_buffer, '4');
|
||||
if (pisa)
|
||||
pisa++;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (!pmodel)
|
||||
if (!pmodel || !pisa)
|
||||
return(CPU_GENERIC);
|
||||
|
||||
|
||||
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
|
||||
return CPU_C910V;
|
||||
|
||||
|
|
@ -140,5 +160,5 @@ void get_cpuconfig(void){
|
|||
}
|
||||
|
||||
void get_libname(void){
|
||||
printf("riscv64\n");
|
||||
printf("%s", cpuname_lower[detect()]);
|
||||
}
|
||||
|
|
|
|||
4
ctest.c
4
ctest.c
|
|
@ -173,6 +173,10 @@ HAVE_C11
|
|||
ARCH_E2K
|
||||
#endif
|
||||
|
||||
#if defined(__csky__)
|
||||
ARCH_CSKY
|
||||
#endif
|
||||
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
ARCH_RISCV64
|
||||
OS_WINDOWS
|
||||
|
|
|
|||
|
|
@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM)
|
|||
ifeq ($(C_COMPILER), GCC)
|
||||
CEXTRALIB += -lgomp
|
||||
endif
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CEXTRALIB += -lomp
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
|||
|
|
@ -96,7 +96,7 @@
|
|||
INTEGER ICAMAXTEST
|
||||
EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
|
||||
EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC MAX
|
||||
* .. Common blocks ..
|
||||
|
|
@ -214,8 +214,8 @@
|
|||
CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
|
||||
+ STRUE4(NP1),SFAC)
|
||||
ELSE IF (ICASE.EQ.8) THEN
|
||||
* .. CSCAL ..
|
||||
CALL CSCAL(N,CA,CX,INCX)
|
||||
* .. CSCALTEST ..
|
||||
CALL CSCALTEST(N,CA,CX,INCX)
|
||||
CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
|
||||
+ SFAC)
|
||||
ELSE IF (ICASE.EQ.9) THEN
|
||||
|
|
@ -236,14 +236,14 @@
|
|||
*
|
||||
INCX = 1
|
||||
IF (ICASE.EQ.8) THEN
|
||||
* CSCAL
|
||||
* CSCALTEST
|
||||
* Add a test for alpha equal to zero.
|
||||
CA = (0.0E0,0.0E0)
|
||||
DO 80 I = 1, 5
|
||||
MWPCT(I) = (0.0E0,0.0E0)
|
||||
MWPCS(I) = (1.0E0,1.0E0)
|
||||
80 CONTINUE
|
||||
CALL CSCAL(5,CA,CX,INCX)
|
||||
CALL CSCALTEST(5,CA,CX,INCX)
|
||||
CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
|
||||
ELSE IF (ICASE.EQ.9) THEN
|
||||
* CSSCALTEST
|
||||
|
|
|
|||
|
|
@ -440,6 +440,7 @@ static real c_b43 = (float)1.;
|
|||
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
|
||||
static complex mwpcs[5], mwpct[5];
|
||||
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
|
||||
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
|
||||
static complex cx[8];
|
||||
extern real scnrm2test_(integer*, complex*, integer*);
|
||||
static integer np1;
|
||||
|
|
@ -481,7 +482,7 @@ static real c_b43 = (float)1.;
|
|||
stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
|
||||
} else if (combla_1.icase == 8) {
|
||||
/* .. CSCAL .. */
|
||||
cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
|
||||
cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
|
||||
ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
|
||||
&ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
|
||||
} else if (combla_1.icase == 9) {
|
||||
|
|
@ -515,7 +516,7 @@ static real c_b43 = (float)1.;
|
|||
mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
|
||||
/* L80: */
|
||||
}
|
||||
cscal_(&c__5, &ca, cx, &combla_1.incx);
|
||||
cscaltest_(&c__5, &ca, cx, &combla_1.incx);
|
||||
ctest_(&c__5, cx, mwpct, mwpcs, sfac);
|
||||
} else if (combla_1.icase == 9) {
|
||||
/* CSSCALTEST */
|
||||
|
|
|
|||
|
|
@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void);
|
|||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
||||
|
||||
int blas_omp_threads_local = 1;
|
||||
|
||||
/* Local Variables */
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@
|
|||
|
||||
int blas_server_avail = 0;
|
||||
int blas_omp_number_max = 0;
|
||||
int blas_omp_threads_local = 1;
|
||||
|
||||
extern int openblas_omp_adaptive_env(void);
|
||||
|
||||
|
|
|
|||
|
|
@ -65,6 +65,8 @@ static CRITICAL_SECTION queue_lock;
|
|||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail = 0;
|
||||
|
||||
int blas_omp_threads_local = 1;
|
||||
|
||||
/* Local Variables */
|
||||
static BLASULONG server_lock = 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR;
|
|||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* Copyright 2023-2024 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
|
@ -122,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55;
|
|||
#endif
|
||||
#else
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
#define gotoblas_CORTEXA55 gotoblas_CORTEXA53
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
#define gotoblas_CORTEXA72 gotoblas_CORTEXA57
|
||||
#define gotoblas_CORTEXA73 gotoblas_CORTEXA57
|
||||
#define gotoblas_FALKOR gotoblas_CORTEXA57
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
|
|
@ -141,14 +142,14 @@ extern gotoblas_t gotoblas_ARMV8SVE;
|
|||
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||
#endif
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#endif
|
||||
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
#define NUM_CORETYPES 16
|
||||
#define NUM_CORETYPES 17
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
|
@ -178,6 +179,7 @@ static char *corename[] = {
|
|||
"emag8180",
|
||||
"neoversen1",
|
||||
"neoversev1",
|
||||
"neoversev2",
|
||||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
|
|
@ -198,10 +200,11 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
|
||||
if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
|
||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
|
@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_NEOVERSEV1);
|
||||
case 12: return (&gotoblas_NEOVERSEN2);
|
||||
case 13: return (&gotoblas_THUNDERX3T110);
|
||||
case 14: return (&gotoblas_CORTEXA55);
|
||||
case 15: return (&gotoblas_ARMV8SVE);
|
||||
case 12: return (&gotoblas_NEOVERSEV2);
|
||||
case 13: return (&gotoblas_NEOVERSEN2);
|
||||
case 14: return (&gotoblas_THUNDERX3T110);
|
||||
case 15: return (&gotoblas_CORTEXA55);
|
||||
case 16: return (&gotoblas_ARMV8SVE);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
|
@ -247,6 +251,10 @@ static gotoblas_t *get_coretype(void) {
|
|||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
char coremsg[128];
|
||||
|
||||
#if defined (OS_DARWIN)
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
#endif
|
||||
|
||||
#if (!defined OS_LINUX && !defined OS_ANDROID)
|
||||
return NULL;
|
||||
#else
|
||||
|
|
@ -308,6 +316,13 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_NEOVERSEN1;
|
||||
}else
|
||||
return &gotoblas_NEOVERSEV1;
|
||||
case 0xd4f:
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
} else {
|
||||
return &gotoblas_NEOVERSEV2;
|
||||
}
|
||||
#endif
|
||||
case 0xd05: // Cortex A55
|
||||
return &gotoblas_CORTEXA55;
|
||||
|
|
@ -352,6 +367,9 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_FALKOR;
|
||||
}
|
||||
break;
|
||||
case 0x61: // Apple
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
break;
|
||||
default:
|
||||
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
|
||||
openblas_warning(1, coremsg);
|
||||
|
|
|
|||
|
|
@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
|
|||
#define CPU_POWER9 9
|
||||
#define CPU_POWER10 10
|
||||
|
||||
#ifndef POWER_9
|
||||
#define POWER_9 0x20000 /* 9 class CPU */
|
||||
#endif
|
||||
#ifndef POWER_10
|
||||
#define POWER_10 0x40000 /* 10 class CPU */
|
||||
#endif
|
||||
|
||||
#ifdef _AIX
|
||||
#include <sys/systemcfg.h>
|
||||
|
||||
|
|
@ -62,7 +69,7 @@ static int cpuid(void)
|
|||
else if (arch == POWER_9) return CPU_POWER9;
|
||||
#endif
|
||||
#ifdef POWER_10
|
||||
else if (arch == POWER_10) return CPU_POWER10;
|
||||
else if (arch >= POWER_10) return CPU_POWER10;
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
|
@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) {
|
|||
if (gotoblas && gotoblas -> init) {
|
||||
strncpy(coren,gotoblas_corename(),20);
|
||||
sprintf(coremsg, "Core: %s\n",coren);
|
||||
if (getenv("GET_OPENBLAS_CORETYPE")) {
|
||||
fprintf(stderr, "%s", coremsg);
|
||||
}
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -3214,7 +3214,7 @@ void blas_shutdown(void){
|
|||
#endif
|
||||
memory[pos].lock = 0;
|
||||
}
|
||||
if (memory_overflowed)
|
||||
if (memory_overflowed) {
|
||||
for (pos = 0; pos < NEW_BUFFERS; pos ++){
|
||||
newmemory[pos].addr = (void *)0;
|
||||
newmemory[pos].used = 0;
|
||||
|
|
@ -3222,6 +3222,10 @@ void blas_shutdown(void){
|
|||
newmemory[pos].pos = -1;
|
||||
#endif
|
||||
newmemory[pos].lock = 0;
|
||||
}
|
||||
free(newmemory);
|
||||
newmemory = NULL;
|
||||
memory_overflowed = 0;
|
||||
}
|
||||
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
|
|
|||
|
|
@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef SMP_SERVER
|
||||
|
||||
extern void openblas_set_num_threads(int num_threads) ;
|
||||
extern int openblas_get_num_threads(void) ;
|
||||
|
||||
void openblas_set_num_threads_(int* num_threads){
|
||||
openblas_set_num_threads(*num_threads);
|
||||
}
|
||||
|
||||
int openblas_set_num_threads_local(int num_threads){
|
||||
int ret = openblas_get_num_threads();
|
||||
openblas_set_num_threads(num_threads);
|
||||
blas_omp_threads_local=num_threads;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
//Single thread
|
||||
|
||||
|
|
@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
|
|||
void openblas_set_num_threads_(int* num_threads){
|
||||
|
||||
}
|
||||
|
||||
int openblas_set_num_threads_local(int num_threads){
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ cblasobjsc="
|
|||
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
||||
cblas_scnrm2 cblas_scasum cblas_cgemmt
|
||||
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
||||
cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
|
||||
"
|
||||
cblasobjsd="
|
||||
cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
|
||||
|
|
@ -69,6 +70,7 @@ cblasobjsd="
|
|||
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
||||
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
|
||||
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
||||
cblas_damax cblas_damin
|
||||
"
|
||||
|
||||
cblasobjss="
|
||||
|
|
@ -80,6 +82,7 @@ cblasobjss="
|
|||
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
||||
cblas_strsv cblas_sgeadd cblas_sgemmt
|
||||
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
||||
cblas_samax cblas_samin
|
||||
"
|
||||
|
||||
cblasobjsz="
|
||||
|
|
@ -91,6 +94,7 @@ cblasobjsz="
|
|||
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
||||
cblas_zaxpby cblas_zgeadd cblas_zgemmt
|
||||
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
||||
cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
|
||||
"
|
||||
|
||||
cblasobjs="cblas_xerbla"
|
||||
|
|
@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z
|
|||
zgedmd
|
||||
zgedmdq
|
||||
"
|
||||
|
||||
#functions added post 3.11
|
||||
|
||||
lapackobjs2c="$lapackobjs2c
|
||||
claqp2rk
|
||||
claqp3rk
|
||||
ctrsyl3
|
||||
"
|
||||
# claqz0
|
||||
# claqz1
|
||||
# claqz2
|
||||
# claqz3
|
||||
# clatrs3
|
||||
|
||||
lapackobjs2d="$lapackobjs2d
|
||||
dgelqs
|
||||
dgelst
|
||||
dgeqp3rk
|
||||
dgeqrs
|
||||
dlaqp2rk
|
||||
dlaqp3rk
|
||||
dlarmm
|
||||
dlatrs3
|
||||
dtrsyl3
|
||||
"
|
||||
# dlaqz0
|
||||
# dlaqz1
|
||||
# dlaqz2
|
||||
# dlaqz3
|
||||
# dlaqz4
|
||||
|
||||
lapackobjs2z="$lapackobjs2z
|
||||
zgelqs
|
||||
zgelst
|
||||
zgeqp3rk
|
||||
zgeqrs
|
||||
zlaqp2rk
|
||||
zlaqp3rk
|
||||
zlatrs3
|
||||
zrscl
|
||||
ztrsyl3
|
||||
"
|
||||
# zlaqz0
|
||||
# zlaqz1
|
||||
# zlaqz2
|
||||
# zlaqz3
|
||||
|
||||
lapack_extendedprecision_objs="
|
||||
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
|
||||
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
|
||||
|
|
@ -1622,6 +1673,14 @@ lapackeobjsc="
|
|||
LAPACKE_cgetsqrhrt_work
|
||||
LAPACKE_cungtsqr_row
|
||||
LAPACKE_cungtsqr_row_work
|
||||
LAPACKE_clangb
|
||||
LAPACKE_clangb_work
|
||||
LAPACKE_ctrsyl3
|
||||
LAPACKE_ctrsyl3_work
|
||||
LAPACKE_ctz_nancheck
|
||||
LAPACKE_ctz_trans
|
||||
LAPACKE_cunhr_col
|
||||
LAPACKE_cunhr_col_work
|
||||
"
|
||||
|
||||
lapackeobjsd="
|
||||
|
|
@ -2239,6 +2298,14 @@ lapackeobjsd="
|
|||
LAPACKE_dgetsqrhrt_work
|
||||
LAPACKE_dorgtsqr_row
|
||||
LAPACKE_dorgtsqr_row_work
|
||||
LAPACKE_dlangb
|
||||
LAPACKE_dlangb_work
|
||||
LAPACKE_dorhr_col
|
||||
LAPACKE_dorhr_col_work
|
||||
LAPACKE_dtrsyl3
|
||||
LAPACKE_dtrsyl3_work
|
||||
LAPACKE_dtz_nancheck
|
||||
LAPACKE_dtz_trans
|
||||
"
|
||||
|
||||
lapackeobjss="
|
||||
|
|
@ -2848,6 +2915,14 @@ lapackeobjss="
|
|||
LAPACKE_sgetsqrhrt_work
|
||||
LAPACKE_sorgtsqr_row
|
||||
LAPACKE_sorgtsqr_row_work
|
||||
LAPACKE_slangb
|
||||
LAPACKE_slangb_work
|
||||
LAPACKE_sorhr_col
|
||||
LAPACKE_sorhr_col_work
|
||||
LAPACKE_strsyl3
|
||||
LAPACKE_strsyl3_work
|
||||
LAPACKE_stz_nancheck
|
||||
LAPACKE_stz_trans
|
||||
"
|
||||
|
||||
lapackeobjsz="
|
||||
|
|
@ -3515,6 +3590,14 @@ lapackeobjsz="
|
|||
LAPACKE_zgetsqrhrt_work
|
||||
LAPACKE_zungtsqr_row
|
||||
LAPACKE_zungtsqr_row_work
|
||||
LAPACKE_zlangb
|
||||
LAPACKE_zlangb_work
|
||||
LAPACKE_ztrsyl3
|
||||
LAPACKE_ztrsyl3_work
|
||||
LAPACKE_ztz_nancheck
|
||||
LAPACKE_ztz_trans
|
||||
LAPACKE_zunhr_col
|
||||
LAPACKE_zunhr_col_work
|
||||
"
|
||||
## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
|
||||
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
|
||||
|
|
@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s="
|
|||
ssysv_aa_2stage ssytrf_aa_2stage
|
||||
ssytrs_aa_2stage
|
||||
slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
|
||||
slarfb_gett
|
||||
"
|
||||
lapack_embeded_underscore_objs_c="
|
||||
chetf2_rook chetrf_rook chetri_rook
|
||||
|
|
@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c="
|
|||
csysv_aa_2stage csytrf_aa_2stage
|
||||
csytrs_aa_2stage
|
||||
claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
|
||||
clarfb_gett
|
||||
"
|
||||
lapack_embeded_underscore_objs_d="
|
||||
dlasyf_rook
|
||||
|
|
@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d="
|
|||
dsysv_aa_2stage
|
||||
dsytrf_aa_2stage dsytrs_aa_2stage
|
||||
dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
|
||||
dlarfb_gett
|
||||
"
|
||||
lapack_embeded_underscore_objs_z="
|
||||
zhetf2_rook zhetrf_rook zhetri_rook
|
||||
|
|
@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z="
|
|||
zhetrs_aa_2stage zsysv_aa_2stage
|
||||
zsytrf_aa_2stage zsytrs_aa_2stage
|
||||
zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
|
||||
zlarfb_gett
|
||||
"
|
||||
|
||||
dirname=`pwd -P`/../lapack-netlib
|
||||
|
|
|
|||
10
f_check
10
f_check
|
|
@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
|
|||
pathf90 pathf95
|
||||
pgf95 pgf90 pgf77 pgfortran nvfortran
|
||||
flang egfortran
|
||||
ifort nagfor ifx ftn crayftn"
|
||||
ifort nagfor ifx ftn crayftn armflang"
|
||||
|
||||
for list in $lists; do
|
||||
for p in $path; do
|
||||
|
|
@ -85,7 +85,11 @@ else
|
|||
*Hewlett*)
|
||||
vendor=CRAY
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
;;
|
||||
*Arm\ F90*)
|
||||
vendor=FLANG
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*GNU*|*GCC*)
|
||||
|
||||
v="${data#*GCC: *\) }"
|
||||
|
|
@ -108,7 +112,7 @@ else
|
|||
if [ "$major" -ge 17 ]; then
|
||||
vendor=FLANGNEW
|
||||
fi
|
||||
;;
|
||||
;;
|
||||
*ifort*|*ifx*)
|
||||
vendor=INTEL
|
||||
openmp='-fopenmp'
|
||||
|
|
|
|||
71
getarch.c
71
getarch.c
|
|
@ -150,6 +150,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* #define FORCE_EV4 */
|
||||
/* #define FORCE_EV5 */
|
||||
/* #define FORCE_EV6 */
|
||||
/* #define FORCE_CSKY */
|
||||
/* #define FORCE_CK860FV */
|
||||
/* #define FORCE_GENERIC */
|
||||
|
||||
#ifdef FORCE_P2
|
||||
|
|
@ -1677,9 +1679,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define LIBNAME "c910v"
|
||||
#define CORENAME "C910V"
|
||||
#endif
|
||||
#endif
|
||||
/*
 * Forced-target description for the RISCV64 "x280" core, active only when
 * FORCE_x280 is defined. It defines FORCE plus the architecture,
 * sub-architecture, kernel sub-directory, library and core name strings, and
 * ARCHCONFIG, a single string of -D style cache/TLB parameters built by
 * adjacent string-literal concatenation (each fragment must therefore end in
 * a space).
 *
 * NOTE(review): L1_DATA_SIZE=64536 is not a power of two (65536 would be
 * 64 KiB) -- possibly a typo; confirm against the hardware documentation.
 * NOTE(review): the #else branch below is empty.
 */
#ifdef FORCE_x280
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "RISCV64"
|
||||
#define SUBARCHITECTURE "x280"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-Dx280 " \
|
||||
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "x280"
|
||||
#define CORENAME "x280"
|
||||
#else
|
||||
#endif
|
||||
|
||||
/*
 * Forced-target description for RISCV64_ZVL256B, active only when
 * FORCE_RISCV64_ZVL256B is defined. It defines FORCE, the name strings
 * (architecture, sub-architecture, kernel sub-directory, library, core) and
 * ARCHCONFIG, a space-separated string of -D style cache/TLB parameters.
 *
 * NOTE(review): L1_DATA_SIZE=64536 is not a power of two (65536 would be
 * 64 KiB) -- possibly a typo; confirm against the hardware documentation.
 */
#ifdef FORCE_RISCV64_ZVL256B
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "RISCV64"
|
||||
#define SUBARCHITECTURE "RISCV64_ZVL256B"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-DRISCV64_ZVL256B " \
|
||||
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "riscv64_zvl256b"
|
||||
#define CORENAME "RISCV64_ZVL256B"
|
||||
#endif
|
||||
|
||||
/*
 * Forced-target description for RISCV64_ZVL128B, active only when
 * FORCE_RISCV64_ZVL128B is defined. It defines FORCE, the name strings
 * (architecture, sub-architecture, kernel sub-directory, library, core) and
 * ARCHCONFIG, a space-separated string of -D style cache/TLB parameters
 * (here a 32768-byte L1 data size and a 1048576-byte L2 size).
 */
#ifdef FORCE_RISCV64_ZVL128B
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "RISCV64"
|
||||
#define SUBARCHITECTURE "RISCV64_ZVL128B"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-DRISCV64_ZVL128B " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "riscv64_zvl128b"
|
||||
#define CORENAME "RISCV64_ZVL128B"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_E2K) || defined(__e2k__)
|
||||
#define FORCE
|
||||
|
|
@ -1692,6 +1731,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "generic"
|
||||
#endif
|
||||
|
||||
/*
 * Forced-target description for the generic CSKY core, active only when
 * FORCE_CSKY is defined. It defines FORCE, the name strings (architecture,
 * sub-architecture, kernel sub-directory, library, core) and ARCHCONFIG.
 *
 * ARCHCONFIG is assembled from adjacent string literals, so every fragment
 * must end with a space; without the space after "-DCSKY" the result would
 * start with the single fused token "-DCSKY-DL1_DATA_SIZE=65536".
 */
#ifdef FORCE_CSKY
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "CSKY"
|
||||
#define SUBARCHITECTURE "CSKY"
|
||||
#define SUBDIRNAME "csky"
|
||||
#define ARCHCONFIG "-DCSKY " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "csky"
|
||||
#define CORENAME "CSKY"
|
||||
#endif
|
||||
|
||||
/*
 * Forced-target description for the CSKY CK860FV core, active only when
 * FORCE_CK860FV is defined. It defines FORCE, the name strings (architecture,
 * sub-architecture, kernel sub-directory, library, core) and ARCHCONFIG, a
 * space-separated string of -D style cache/TLB parameters.
 *
 * NOTE(review): SUBARCHITECTURE is "CK860V" while the guard macro, the
 * -DCK860FV flag, LIBNAME and CORENAME all use "CK860FV" -- confirm whether
 * the missing "F" is intentional.
 */
#ifdef FORCE_CK860FV
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "CSKY"
|
||||
#define SUBARCHITECTURE "CK860V"
|
||||
#define SUBDIRNAME "csky"
|
||||
#define ARCHCONFIG "-DCK860FV " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "ck860fv"
|
||||
#define CORENAME "CK860FV"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#ifdef USER_TARGET
|
||||
|
|
@ -1766,7 +1832,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef OPENBLAS_SUPPORTED
|
||||
#error "This arch/CPU is not supported by OpenBLAS."
|
||||
#endif
|
||||
|
|
@ -1831,7 +1896,7 @@ int main(int argc, char *argv[]){
|
|||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
@ -1979,7 +2044,7 @@ printf("ELF_VERSION=2\n");
|
|||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -119,6 +119,7 @@ endif ()
|
|||
if (BUILD_BFLOAT16)
|
||||
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
|
|
@ -130,6 +131,8 @@ endif ()
|
|||
foreach (float_type ${FLOAT_TYPES})
|
||||
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
|
|
|
|||
|
|
@ -270,7 +270,8 @@ CSBLAS1OBJS = \
|
|||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
|
||||
cblas_samin.$(SUFFIX)
|
||||
|
||||
CSBLAS2OBJS = \
|
||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
||||
|
|
@ -295,7 +296,8 @@ CDBLAS1OBJS = \
|
|||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
|
||||
cblas_damin.$(SUFFIX)
|
||||
|
||||
CDBLAS2OBJS = \
|
||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
||||
|
|
@ -315,7 +317,7 @@ CCBLAS1OBJS = \
|
|||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
|
|
@ -340,12 +342,12 @@ CXERBLAOBJ = \
|
|||
|
||||
CZBLAS1OBJS = \
|
||||
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||
cblas_zcopy.$(SUFFIX) \
|
||||
cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
|
||||
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
|
||||
|
||||
|
||||
|
|
@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
|||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
|
|
@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
|
|||
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
# CBLAS amax objects (s, d, sc, dz prefixes): each is compiled from max.c
# with -DCBLAS, USE_ABS defined and USE_MIN explicitly undefined.
# $< is the prerequisite (max.c) and $(@F) the file part of the target name.
cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
# CBLAS amin objects: same source file, but with both USE_ABS and USE_MIN
# defined.
cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
|
@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
|
|||
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
# CBLAS axpyc objects (c, z, x prefixes): each is compiled from zaxpy.c with
# -DCBLAS and -DCONJ; the neighbouring cblas_caxpy / cblas_zaxpy rules build
# the same source without -DCONJ.
cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||
|
||||
cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||
|
||||
cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||
|
||||
cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
|
@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
|||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
|
|
|
|||
|
|
@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
|||
|
||||
char transA, transB, Uplo;
|
||||
blasint nrowa, nrowb;
|
||||
#if defined(COMPLEX)
|
||||
blasint ncolb;
|
||||
#endif
|
||||
IFLOAT *buffer;
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
|
|
@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
|||
uplo = 0;
|
||||
if (Uplo == 'L')
|
||||
uplo = 1;
|
||||
|
||||
|
||||
nrowa = m;
|
||||
if (transa) nrowa = k;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb) nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = m;
|
||||
#endif
|
||||
if (transb & 1) {
|
||||
nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = k;
|
||||
#endif
|
||||
}
|
||||
|
||||
info = 0;
|
||||
|
||||
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowa))
|
||||
if (ldb < MAX(1, nrowb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowb))
|
||||
if (lda < MAX(1, nrowa))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
|
|
@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
blasint info;
|
||||
blasint lda, ldb;
|
||||
FLOAT *a, *b;
|
||||
#if defined(COMPLEX)
|
||||
blasint nrowb, ncolb;
|
||||
#endif
|
||||
XFLOAT *buffer;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
|
@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
info = -1;
|
||||
|
||||
blasint nrowa, nrowb;
|
||||
blasint nrowa;
|
||||
#if !defined(COMPLEX)
|
||||
blasint nrowb;
|
||||
#endif
|
||||
nrowa = m;
|
||||
if (transa) nrowa = k;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb) nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = m;
|
||||
#endif
|
||||
if (transb & 1) {
|
||||
nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = k;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
|
|
@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
info = -1;
|
||||
|
||||
blasint ncola, ncolb;
|
||||
ncola = k;
|
||||
if (transa) ncola = m;
|
||||
ncolb = m;
|
||||
if (transb) ncolb = k;
|
||||
blasint ncola;
|
||||
#if !defined(COMPLEX)
|
||||
blasint ncolb;
|
||||
#endif
|
||||
ncola = m;
|
||||
if (transa & 1) ncola = k;
|
||||
ncolb = k;
|
||||
#if defined(COMPLEX)
|
||||
nrowb = m;
|
||||
#endif
|
||||
|
||||
if (transb & 1) {
|
||||
#if defined(COMPLEX)
|
||||
nrowb = k;
|
||||
#endif
|
||||
ncolb = m;
|
||||
}
|
||||
|
||||
if (ldc < MAX(1,m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, ncolb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, ncola))
|
||||
info = 8;
|
||||
if (lda < MAX(1, ncola))
|
||||
info = 10;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (transa < 0)
|
||||
info = 3;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
|
@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
IDEBUG_START;
|
||||
|
||||
const blasint incb = (transb == 0) ? 1 : ldb;
|
||||
#if defined(COMPLEX)
|
||||
if (transb > 1){
|
||||
#ifndef CBLAS
|
||||
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||
#else
|
||||
if (order == CblasColMajor)
|
||||
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||
if (order == CblasRowMajor)
|
||||
IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
|
||||
|
||||
if (uplo == 1) {
|
||||
for (i = 0; i < m; i++) {
|
||||
|
|
@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#if defined(COMPLEX)
|
||||
aa = a + i * 2;
|
||||
bb = b + i * ldb * 2;
|
||||
if (transa) {
|
||||
if (transa & 1) {
|
||||
aa = a + lda * i * 2;
|
||||
}
|
||||
if (transb)
|
||||
if (transb & 1)
|
||||
bb = b + i * 2;
|
||||
cc = c + i * 2 * ldc + i * 2;
|
||||
#else
|
||||
aa = a + i;
|
||||
bb = b + i * ldb;
|
||||
if (transa) {
|
||||
if (transa & 1) {
|
||||
aa = a + lda * i;
|
||||
}
|
||||
if (transb)
|
||||
if (transb & 1)
|
||||
bb = b + i;
|
||||
cc = c + i * ldc + i;
|
||||
#endif
|
||||
|
|
@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||
return;
|
||||
continue;
|
||||
#else
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
|
@ -478,7 +528,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
|
|
@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
aa, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
|
|
@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
aa, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
||||
bb, incb, cc, 1, buffer);
|
||||
else
|
||||
|
|
@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||
lda, bb, incb, cc,
|
||||
1, buffer,
|
||||
|
|
@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
l = j;
|
||||
#if defined COMPLEX
|
||||
bb = b + i * ldb * 2;
|
||||
if (transb) {
|
||||
if (transb & 1) {
|
||||
bb = b + i * 2;
|
||||
}
|
||||
cc = c + i * 2 * ldc;
|
||||
#else
|
||||
bb = b + i * ldb;
|
||||
if (transb) {
|
||||
if (transb & 1) {
|
||||
bb = b + i;
|
||||
}
|
||||
cc = c + i * ldc;
|
||||
|
|
@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||
return;
|
||||
continue;
|
||||
#else
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
|
@ -567,7 +617,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
|
|
@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
a, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
|
|
@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
a, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
||||
incb, cc, 1, buffer);
|
||||
else
|
||||
|
|
@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||
bb, incb, cc, 1,
|
||||
buffer, nthreads);
|
||||
|
|
|
|||
|
|
@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
#ifdef SMP
|
||||
|
||||
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
|
|
|||
|
|
@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
}
|
||||
#endif
|
||||
|
||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
|
||||
if ( *rows > *cols )
|
||||
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT);
|
||||
else
|
||||
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT);
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
|
|
|||
|
|
@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
|
|||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
#ifndef DOUBLE
|
||||
if (args.m*args.n < 40000)
|
||||
#else
|
||||
if (args.m*args.n < 10000)
|
||||
#endif
|
||||
args.nthreads=1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
|||
|
||||
#else
|
||||
|
||||
#ifdef COMPLEX
|
||||
FLOAT CNAME(blasint n, void *vx, blasint incx){
|
||||
FLOAT *x = (FLOAT*) vx;
|
||||
#else
|
||||
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||
|
||||
#endif
|
||||
|
||||
FLOAT ret;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
|
|
|||
|
|
@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
|||
else
|
||||
{
|
||||
dp2 = *dd2 * dy1;
|
||||
if(dp2 == ZERO)
|
||||
{
|
||||
dflag = -TWO;
|
||||
dparam[0] = dflag;
|
||||
return;
|
||||
}
|
||||
dp1 = *dd1 * *dx1;
|
||||
dq2 = dp2 * dy1;
|
||||
dq1 = dp1 * *dx1;
|
||||
|
|
@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
|||
dh12 = dp2 / dp1;
|
||||
|
||||
du = ONE - dh12 * dh21;
|
||||
if(du > ZERO)
|
||||
{
|
||||
dflag = ZERO;
|
||||
*dd1 = *dd1 / du;
|
||||
*dd2 = *dd2 / du;
|
||||
*dx1 = *dx1 * du;
|
||||
} else {
|
||||
dflag = -ONE;
|
||||
|
||||
dh11 = ZERO;
|
||||
dh12 = ZERO;
|
||||
dh21 = ZERO;
|
||||
dh22 = ZERO;
|
||||
|
||||
*dd1 = ZERO;
|
||||
*dd2 = ZERO;
|
||||
*dx1 = ZERO;
|
||||
}
|
||||
dflag = ZERO;
|
||||
*dd1 = *dd1 / du;
|
||||
*dd2 = *dd2 / du;
|
||||
*dx1 = *dx1 * du;
|
||||
|
||||
}
|
||||
else
|
||||
|
|
|
|||
|
|
@ -0,0 +1,447 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2024, The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "common.h"
|
||||
|
||||
#define SMP_THRESHOLD_MIN 65536.0
|
||||
#define ERROR_NAME "SBGEMMT "
|
||||
|
||||
#ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||
#define GEMM_MULTITHREAD_THRESHOLD 4
|
||||
#endif
|
||||
|
||||
/*
 * SBGEMMT interface. Two entry points share a single function body:
 *   - without CBLAS: NAME(), which receives every argument by pointer and
 *     takes UPLO/TRANSA/TRANSB as characters;
 *   - with CBLAS: CNAME(), which receives values, CBLAS enums and a storage
 *     order.
 * Each entry point validates its arguments and reports a problem through
 * xerbla (with ERROR_NAME) before returning. The shared body then loops over
 * i = 0 .. m-1 and issues one SBGEMV call per iteration (the threaded variant
 * when SMP is defined and the work estimate j*k is large enough), writing to
 * a slice of c that starts at column offset i * ldc.
 */
#ifndef CBLAS
|
||||
|
||||
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||
blasint * M, blasint * K,
|
||||
FLOAT * Alpha,
|
||||
IFLOAT * a, blasint * ldA,
|
||||
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
||||
{
|
||||
|
||||
blasint m, k;
|
||||
blasint lda, ldb, ldc;
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
|
||||
char transA, transB, Uplo;
|
||||
blasint nrowa, nrowb;
|
||||
IFLOAT *buffer;
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
FLOAT alpha, beta;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
m = *M;
|
||||
k = *K;
|
||||
|
||||
alpha = *Alpha;
|
||||
beta = *Beta;
|
||||
|
||||
lda = *ldA;
|
||||
ldb = *ldB;
|
||||
ldc = *ldC;
|
||||
|
||||
transA = *TRANSA;
|
||||
transB = *TRANSB;
|
||||
Uplo = *UPLO;
|
||||
TOUPPER(transA);
|
||||
TOUPPER(transB);
|
||||
TOUPPER(Uplo);
|
||||
|
||||
/* -1 means "not recognised"; 'N' and 'R' map to 0, 'T' and 'C' map to 1. */
transa = -1;
|
||||
transb = -1;
|
||||
uplo = -1;
|
||||
|
||||
if (transA == 'N')
|
||||
transa = 0;
|
||||
if (transA == 'T')
|
||||
transa = 1;
|
||||
|
||||
if (transA == 'R')
|
||||
transa = 0;
|
||||
if (transA == 'C')
|
||||
transa = 1;
|
||||
|
||||
if (transB == 'N')
|
||||
transb = 0;
|
||||
if (transB == 'T')
|
||||
transb = 1;
|
||||
|
||||
if (transB == 'R')
|
||||
transb = 0;
|
||||
if (transB == 'C')
|
||||
transb = 1;
|
||||
|
||||
if (Uplo == 'U')
|
||||
uplo = 0;
|
||||
if (Uplo == 'L')
|
||||
uplo = 1;
|
||||
/* Minimum leading dimensions: m or k depending on the transpose flags. */
nrowa = m;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb & 1) nrowb = m;
|
||||
|
||||
info = 0;
|
||||
|
||||
/* Checks run from the highest argument number down; a later failing check
 * overwrites info, so the value reported is the last one assigned. */
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowa))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
#else
|
||||
|
||||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
|
||||
blasint k,
|
||||
FLOAT alpha,
|
||||
IFLOAT * A, blasint LDA,
|
||||
IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
|
||||
{
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
blasint lda, ldb;
|
||||
IFLOAT *a, *b;
|
||||
/* NOTE(review): declared as XFLOAT * here but as IFLOAT * in the non-CBLAS
 * variant, and STACK_ALLOC below is invoked with IFLOAT -- confirm the
 * intended element type. */
XFLOAT *buffer;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
uplo = -1;
|
||||
transa = -1;
|
||||
transb = -1;
|
||||
/* info stays 0 if order matches neither branch below, which the
 * "info >= 0" test further down treats as an error. */
info = 0;
|
||||
|
||||
if (order == CblasColMajor) {
|
||||
if (Uplo == CblasUpper) uplo = 0;
|
||||
if (Uplo == CblasLower) uplo = 1;
|
||||
|
||||
if (TransA == CblasNoTrans)
|
||||
transa = 0;
|
||||
if (TransA == CblasTrans)
|
||||
transa = 1;
|
||||
|
||||
if (TransA == CblasConjNoTrans)
|
||||
transa = 0;
|
||||
if (TransA == CblasConjTrans)
|
||||
transa = 1;
|
||||
|
||||
if (TransB == CblasNoTrans)
|
||||
transb = 0;
|
||||
if (TransB == CblasTrans)
|
||||
transb = 1;
|
||||
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transb = 0;
|
||||
if (TransB == CblasConjTrans)
|
||||
transb = 1;
|
||||
|
||||
a = (void *)A;
|
||||
b = (void *)B;
|
||||
lda = LDA;
|
||||
ldb = LDB;
|
||||
|
||||
/* -1 means "no error found"; any failing check below makes info >= 0. */
info = -1;
|
||||
|
||||
blasint nrowa;
|
||||
blasint nrowb;
|
||||
nrowa = m;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb & 1) nrowb = m;
|
||||
|
||||
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowa))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
||||
/* Row-major: the roles of A and B are exchanged (pointers, leading
 * dimensions and transpose flags), and the info codes of the exchanged
 * arguments are swapped to match (8 <-> 10 and 2 <-> 3). */
if (order == CblasRowMajor) {
|
||||
|
||||
a = (void *)B;
|
||||
b = (void *)A;
|
||||
|
||||
lda = LDB;
|
||||
ldb = LDA;
|
||||
|
||||
if (Uplo == CblasUpper) uplo = 0;
|
||||
if (Uplo == CblasLower) uplo = 1;
|
||||
|
||||
if (TransB == CblasNoTrans)
|
||||
transa = 0;
|
||||
if (TransB == CblasTrans)
|
||||
transa = 1;
|
||||
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transa = 0;
|
||||
if (TransB == CblasConjTrans)
|
||||
transa = 1;
|
||||
|
||||
if (TransA == CblasNoTrans)
|
||||
transb = 0;
|
||||
if (TransA == CblasTrans)
|
||||
transb = 1;
|
||||
|
||||
if (TransA == CblasConjNoTrans)
|
||||
transb = 0;
|
||||
if (TransA == CblasConjTrans)
|
||||
transb = 1;
|
||||
|
||||
info = -1;
|
||||
|
||||
blasint ncola;
|
||||
blasint ncolb;
|
||||
|
||||
ncola = m;
|
||||
if (transa & 1) ncola = k;
|
||||
ncolb = k;
|
||||
|
||||
if (transb & 1) {
|
||||
ncolb = m;
|
||||
}
|
||||
|
||||
if (ldc < MAX(1,m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, ncolb))
|
||||
info = 8;
|
||||
if (lda < MAX(1, ncola))
|
||||
info = 10;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
if (transa < 0)
|
||||
info = 3;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
||||
if (info >= 0) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
/* ---- body shared by both entry points ---- */
int buffer_size;
|
||||
blasint i, j;
|
||||
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
#endif
|
||||
|
||||
|
||||
/* Kernel tables indexed by transa: entry 0 is the "n" kernel, entry 1 the
 * "t" kernel. */
#ifdef SMP
|
||||
static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
|
||||
BLASLONG, IFLOAT *, BLASLONG, FLOAT,
|
||||
FLOAT *, BLASLONG, int) = {
|
||||
sbgemv_thread_n, sbgemv_thread_t,
|
||||
};
|
||||
#endif
|
||||
int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
|
||||
IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
|
||||
SBGEMV_N, SBGEMV_T,};
|
||||
|
||||
|
||||
if (m == 0)
|
||||
return;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
/* Stride used to walk b as the vector operand: 1 when b is not transposed,
 * ldb otherwise. */
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
|
||||
|
||||
/* uplo == 1 ('L' / CblasLower): iteration i works on j = m - i elements of
 * c starting at the diagonal position c + i * ldc + i, with a advanced to
 * row/column i accordingly. */
if (uplo == 1) {
|
||||
for (i = 0; i < m; i++) {
|
||||
j = m - i;
|
||||
|
||||
aa = a + i;
|
||||
bb = b + i * ldb;
|
||||
if (transa & 1) {
|
||||
aa = a + lda * i;
|
||||
}
|
||||
if (transb & 1)
|
||||
bb = b + i;
|
||||
cc = c + i * ldc + i;
|
||||
|
||||
/* Disabled code; note that it refers to a variable `l` that is not
 * declared in this function. */
#if 0
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO)
|
||||
continue;
|
||||
#endif
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
/* NOTE(review): buffer is allocated and freed in every iteration but is not
 * passed to any of the gemv calls below -- confirm whether it is needed. */
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
/* Stay single-threaded when the j * k work estimate is below the threshold. */
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
/* The two dimension arguments are (j, k) for the non-transposed kernel and
 * (k, j) for the transposed one. */
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, alpha, aa, lda,
|
||||
bb, incb, beta, cc, 1);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, alpha, aa, lda,
|
||||
bb, incb, beta, cc, 1);
|
||||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||
lda, bb, incb, beta, cc,
|
||||
1, nthreads);
|
||||
else
|
||||
(gemv_thread[(int)transa]) (k, j, alpha, aa,
|
||||
lda, bb, incb, beta, cc,
|
||||
1, nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
}
|
||||
/* Any other uplo value: iteration i works on the first j = i + 1 elements
 * of c starting at c + i * ldc, always reading a from its start. */
} else {
|
||||
|
||||
for (i = 0; i < m; i++) {
|
||||
j = i + 1;
|
||||
|
||||
bb = b + i * ldb;
|
||||
if (transb & 1) {
|
||||
bb = b + i;
|
||||
}
|
||||
cc = c + i * ldc;
|
||||
|
||||
/* Disabled code; note that it refers to a variable `l` that is not
 * declared in this function. */
#if 0
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO)
|
||||
continue;
|
||||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
/* NOTE(review): as in the branch above, buffer is allocated and freed but
 * never handed to the gemv calls -- confirm whether it is needed. */
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
|
||||
incb, beta, cc, 1);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
|
||||
incb, beta, cc, 1);
|
||||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||
bb, incb, beta, cc, 1,
|
||||
nthreads);
|
||||
else
|
||||
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
|
||||
bb, incb, beta, cc, 1,
|
||||
nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return;
|
||||
}
|
||||
|
|
@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
|
||||
void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
|
||||
{
|
||||
|
||||
blasint n = *N;
|
||||
blasint incx = *INCX;
|
||||
blasint incy = *INCY;
|
||||
FLOAT* ALPHA = (FLOAT*) VALPHA;
|
||||
FLOAT* BETA = (FLOAT*) VBETA;
|
||||
|
||||
#else
|
||||
|
||||
|
|
|
|||
|
|
@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
}
|
||||
#endif
|
||||
|
||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
|
||||
if ( *rows > *cols )
|
||||
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
else
|
||||
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
|
|
|||
|
|
@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
|
|||
else
|
||||
{
|
||||
temp = - da_i * x[ip+1] ;
|
||||
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
|
||||
x[ip+1] = da_i * x[ip] ;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,206 +1 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
TRSMCOPYLN_M = trsm_lncopy_sve.c
|
||||
TRSMCOPYLT_M = trsm_ltcopy_sve.c
|
||||
TRSMCOPYUN_M = trsm_uncopy_sve.c
|
||||
TRSMCOPYUT_M = trsm_utcopy_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
|
||||
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
|
||||
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
|
||||
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
|
||||
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
|
||||
SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
SSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
SSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
|
||||
DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
CSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
include $(KERNELDIR)/KERNEL.ARMV8SVE
|
||||
|
|
|
|||
|
|
@ -1,196 +1 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
endif
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
include $(KERNELDIR)/KERNEL.CORTEXA53
|
||||
|
|
|
|||
|
|
@ -1,184 +1 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
|
||||
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
|
||||
endif
|
||||
|
||||
ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
|
||||
endif
|
||||
include $(KERNELDIR)/KERNEL.THUNDERX2T99
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
Copyright (c) 2022, Arm Ltd
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
|
@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <arm_sve.h>
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define SVE_TYPE svfloat64_t
|
||||
#define SVE_ZERO svdup_f64(0.0)
|
||||
#define SVE_WHILELT svwhilelt_b64
|
||||
#define SVE_ALL svptrue_b64()
|
||||
#define SVE_WIDTH svcntd()
|
||||
#define DTYPE "d"
|
||||
#define WIDTH "d"
|
||||
#define SHIFT "3"
|
||||
#else
|
||||
#define SVE_TYPE svfloat32_t
|
||||
#define SVE_ZERO svdup_f32(0.0)
|
||||
#define SVE_WHILELT svwhilelt_b32
|
||||
#define SVE_ALL svptrue_b32()
|
||||
#define SVE_WIDTH svcntw()
|
||||
#define DTYPE "s"
|
||||
#define WIDTH "w"
|
||||
#define SHIFT "2"
|
||||
#endif
|
||||
|
||||
static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
SVE_TYPE acc_a = SVE_ZERO;
|
||||
SVE_TYPE acc_b = SVE_ZERO;
|
||||
#define COUNT \
|
||||
" cnt"WIDTH" x9 \n"
|
||||
#define SETUP_TRUE \
|
||||
" ptrue p0."DTYPE" \n"
|
||||
#define OFFSET_INPUTS \
|
||||
" add x12, %[X_], x9, lsl #"SHIFT" \n" \
|
||||
" add x13, %[Y_], x9, lsl #"SHIFT" \n"
|
||||
#define TAIL_WHILE \
|
||||
" whilelo p1."DTYPE", x8, x0 \n"
|
||||
#define UPDATE(pg, x,y,out) \
|
||||
" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \
|
||||
" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \
|
||||
" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n"
|
||||
#define SUM_VECTOR(v) \
|
||||
" faddv "DTYPE""v", p0, z"v"."DTYPE" \n"
|
||||
#define RET \
|
||||
" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n"
|
||||
|
||||
BLASLONG sve_width = SVE_WIDTH;
|
||||
#define DOT_KERNEL \
|
||||
COUNT \
|
||||
" mov z1.d, #0 \n" \
|
||||
" mov z0.d, #0 \n" \
|
||||
" mov x8, #0 \n" \
|
||||
" movi d1, #0x0 \n" \
|
||||
SETUP_TRUE \
|
||||
" neg x10, x9, lsl #1 \n" \
|
||||
" ands x11, x10, x0 \n" \
|
||||
" b.eq 2f // skip_2x \n" \
|
||||
OFFSET_INPUTS \
|
||||
"1: // vector_2x \n" \
|
||||
UPDATE("p0", "%[X_]", "%[Y_]", "z1") \
|
||||
UPDATE("p0", "x12", "x13", "z0") \
|
||||
" sub x8, x8, x10 \n" \
|
||||
" cmp x8, x11 \n" \
|
||||
" b.lo 1b // vector_2x \n" \
|
||||
SUM_VECTOR("1") \
|
||||
"2: // skip_2x \n" \
|
||||
" neg x10, x9 \n" \
|
||||
" and x10, x10, x0 \n" \
|
||||
" cmp x8, x10 \n" \
|
||||
" b.hs 4f // tail \n" \
|
||||
"3: // vector_1x \n" \
|
||||
UPDATE("p0", "%[X_]", "%[Y_]", "z0") \
|
||||
" add x8, x8, x9 \n" \
|
||||
" cmp x8, x10 \n" \
|
||||
" b.lo 3b // vector_1x \n" \
|
||||
"4: // tail \n" \
|
||||
" cmp x10, x0 \n" \
|
||||
" b.eq 5f // end \n" \
|
||||
TAIL_WHILE \
|
||||
UPDATE("p1", "%[X_]", "%[Y_]", "z0") \
|
||||
"5: // end \n" \
|
||||
SUM_VECTOR("0") \
|
||||
RET
|
||||
|
||||
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
|
||||
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
|
||||
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
|
||||
static
|
||||
FLOAT
|
||||
dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y)
|
||||
{
|
||||
FLOAT ret;
|
||||
|
||||
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
|
||||
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
|
||||
SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
|
||||
SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
|
||||
asm(DOT_KERNEL
|
||||
:
|
||||
[RET_] "=&w" (ret)
|
||||
:
|
||||
[N_] "r" (n),
|
||||
[X_] "r" (x),
|
||||
[Y_] "r" (y)
|
||||
:);
|
||||
|
||||
acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a);
|
||||
acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b);
|
||||
}
|
||||
|
||||
return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b);
|
||||
return ret;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -223,7 +223,7 @@ zscal_begin:
|
|||
fcmp DA_I, #0.0
|
||||
beq .Lzscal_kernel_RI_zero
|
||||
|
||||
b .Lzscal_kernel_R_zero
|
||||
// b .Lzscal_kernel_R_zero
|
||||
|
||||
.Lzscal_kernel_R_non_zero:
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,149 @@
|
|||
SAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = ../arm/amax.c
|
||||
CAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = ../arm/zamax.c
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = ../arm/iamax.c
|
||||
ICAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMAXKERNEL = ../arm/izamax.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
SASUMKERNEL = ../arm/asum.c
|
||||
DASUMKERNEL = ../arm/asum.c
|
||||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
||||
SCOPYKERNEL = ../arm/copy.c
|
||||
DCOPYKERNEL = ../arm/copy.c
|
||||
CCOPYKERNEL = ../arm/zcopy.c
|
||||
ZCOPYKERNEL = ../arm/zcopy.c
|
||||
|
||||
SDOTKERNEL = ../arm/dot.c
|
||||
DDOTKERNEL = ../arm/dot.c
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
||||
SSCALKERNEL = ../arm/scal.c
|
||||
DSCALKERNEL = ../arm/scal.c
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
|
||||
SSWAPKERNEL = ../arm/swap.c
|
||||
DSWAPKERNEL = ../arm/swap.c
|
||||
CSWAPKERNEL = ../arm/zswap.c
|
||||
ZSWAPKERNEL = ../arm/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
|
||||
|
|
@ -0,0 +1 @@
|
|||
clean ::
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,
|
|||
|
||||
if ( rows <= 0 ) return(0);
|
||||
if ( cols <= 0 ) return(0);
|
||||
if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0);
|
||||
|
||||
aptr = a;
|
||||
lda *= 2;
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -3,56 +3,126 @@ ifndef NO_LSX
|
|||
SDOTKERNEL = dot_lsx.S
|
||||
DSDOTKERNEL = dot_lsx.S
|
||||
DDOTKERNEL = dot_lsx.S
|
||||
CDOTKERNEL = cdot_lsx.S
|
||||
ZDOTKERNEL = cdot_lsx.S
|
||||
|
||||
SSCALKERNEL = sscal_lsx.S
|
||||
DSCALKERNEL = dscal_lsx.S
|
||||
SSCALKERNEL = scal_lsx.S
|
||||
DSCALKERNEL = scal_lsx.S
|
||||
CSCALKERNEL = cscal_lsx.S
|
||||
ZSCALKERNEL = cscal_lsx.S
|
||||
|
||||
SAMAXKERNEL = samax_lsx.S
|
||||
DAMAXKERNEL = damax_lsx.S
|
||||
SAMAXKERNEL = amax_lsx.S
|
||||
DAMAXKERNEL = amax_lsx.S
|
||||
CAMAXKERNEL = camax_lsx.S
|
||||
ZAMAXKERNEL = camax_lsx.S
|
||||
|
||||
SAMINKERNEL = samin_lsx.S
|
||||
DAMINKERNEL = damin_lsx.S
|
||||
SAMINKERNEL = amin_lsx.S
|
||||
DAMINKERNEL = amin_lsx.S
|
||||
CAMINKERNEL = camin_lsx.S
|
||||
ZAMINKERNEL = camin_lsx.S
|
||||
|
||||
SMAXKERNEL = smax_lsx.S
|
||||
DMAXKERNEL = dmax_lsx.S
|
||||
SMAXKERNEL = max_lsx.S
|
||||
DMAXKERNEL = max_lsx.S
|
||||
|
||||
SMINKERNEL = smin_lsx.S
|
||||
DMINKERNEL = dmin_lsx.S
|
||||
SMINKERNEL = min_lsx.S
|
||||
DMINKERNEL = min_lsx.S
|
||||
|
||||
ISMAXKERNEL = ismax_lsx.S
|
||||
IDMAXKERNEL = idmax_lsx.S
|
||||
ISMAXKERNEL = imax_lsx.S
|
||||
IDMAXKERNEL = imax_lsx.S
|
||||
|
||||
ISMINKERNEL = ismin_lsx.S
|
||||
IDMINKERNEL = idmin_lsx.S
|
||||
ISMINKERNEL = imin_lsx.S
|
||||
IDMINKERNEL = imin_lsx.S
|
||||
|
||||
ISAMAXKERNEL = isamax_lsx.S
|
||||
IDAMAXKERNEL = idamax_lsx.S
|
||||
ISAMAXKERNEL = iamax_lsx.S
|
||||
IDAMAXKERNEL = iamax_lsx.S
|
||||
ICAMAXKERNEL = icamax_lsx.S
|
||||
IZAMAXKERNEL = icamax_lsx.S
|
||||
|
||||
ISAMINKERNEL = isamin_lsx.S
|
||||
IDAMINKERNEL = idamin_lsx.S
|
||||
ISAMINKERNEL = iamin_lsx.S
|
||||
IDAMINKERNEL = iamin_lsx.S
|
||||
ICAMINKERNEL = icamin_lsx.S
|
||||
IZAMINKERNEL = icamin_lsx.S
|
||||
|
||||
SCOPYKERNEL = scopy_lsx.S
|
||||
DCOPYKERNEL = dcopy_lsx.S
|
||||
SCOPYKERNEL = copy_lsx.S
|
||||
DCOPYKERNEL = copy_lsx.S
|
||||
CCOPYKERNEL = ccopy_lsx.S
|
||||
ZCOPYKERNEL = ccopy_lsx.S
|
||||
|
||||
SSWAPKERNEL = sswap_lsx.S
|
||||
DSWAPKERNEL = dswap_lsx.S
|
||||
SSWAPKERNEL = swap_lsx.S
|
||||
DSWAPKERNEL = swap_lsx.S
|
||||
|
||||
SAXPYKERNEL = saxpy_lsx.S
|
||||
DAXPYKERNEL = daxpy_lsx.S
|
||||
SAXPYKERNEL = axpy_lsx.S
|
||||
DAXPYKERNEL = axpy_lsx.S
|
||||
CAXPYKERNEL = caxpy_lsx.S
|
||||
ZAXPYKERNEL = caxpy_lsx.S
|
||||
|
||||
SAXPBYKERNEL = saxpby_lsx.S
|
||||
DAXPBYKERNEL = daxpby_lsx.S
|
||||
SAXPBYKERNEL = axpby_lsx.S
|
||||
DAXPBYKERNEL = axpby_lsx.S
|
||||
CAXPBYKERNEL = caxpby_lsx.S
|
||||
ZAXPBYKERNEL = caxpby_lsx.S
|
||||
|
||||
SSUMKERNEL = ssum_lsx.S
|
||||
DSUMKERNEL = dsum_lsx.S
|
||||
SSUMKERNEL = sum_lsx.S
|
||||
DSUMKERNEL = sum_lsx.S
|
||||
|
||||
SASUMKERNEL = sasum_lsx.S
|
||||
DASUMKERNEL = dasum_lsx.S
|
||||
SASUMKERNEL = asum_lsx.S
|
||||
DASUMKERNEL = asum_lsx.S
|
||||
CASUMKERNEL = casum_lsx.S
|
||||
ZASUMKERNEL = casum_lsx.S
|
||||
|
||||
SROTKERNEL = srot_lsx.S
|
||||
DROTKERNEL = drot_lsx.S
|
||||
SROTKERNEL = rot_lsx.S
|
||||
DROTKERNEL = rot_lsx.S
|
||||
CROTKERNEL = crot_lsx.S
|
||||
ZROTKERNEL = crot_lsx.S
|
||||
|
||||
SNRM2KERNEL = snrm2_lsx.S
|
||||
DNRM2KERNEL = dnrm2_lsx.S
|
||||
CNRM2KERNEL = cnrm2_lsx.S
|
||||
ZNRM2KERNEL = znrm2_lsx.S
|
||||
|
||||
CSWAPKERNEL = cswap_lsx.S
|
||||
ZSWAPKERNEL = cswap_lsx.S
|
||||
|
||||
CSUMKERNEL = csum_lsx.S
|
||||
ZSUMKERNEL = csum_lsx.S
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_8x4.S
|
||||
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
|
||||
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_lsx.S
|
||||
DGEMMOTCOPY = dgemm_tcopy_4_lsx.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
|
||||
CGEMMINCOPY = cgemm_ncopy_8_lsx.S
|
||||
CGEMMITCOPY = cgemm_tcopy_8_lsx.S
|
||||
CGEMMONCOPY = cgemm_ncopy_4_lsx.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_4_lsx.S
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
|
||||
ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
|
||||
ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -3,57 +3,87 @@ ifndef NO_LASX
|
|||
SDOTKERNEL = dot_lasx.S
|
||||
DSDOTKERNEL = dot_lasx.S
|
||||
DDOTKERNEL = dot_lasx.S
|
||||
CDOTKERNEL = cdot_lasx.S
|
||||
ZDOTKERNEL = cdot_lasx.S
|
||||
|
||||
SSCALKERNEL = sscal_lasx.S
|
||||
DSCALKERNEL = dscal_lasx.S
|
||||
SSCALKERNEL = scal_lasx.S
|
||||
DSCALKERNEL = scal_lasx.S
|
||||
CSCALKERNEL = cscal_lasx.S
|
||||
ZSCALKERNEL = cscal_lasx.S
|
||||
|
||||
SAMAXKERNEL = samax_lasx.S
|
||||
DAMAXKERNEL = damax_lasx.S
|
||||
SAMAXKERNEL = amax_lasx.S
|
||||
DAMAXKERNEL = amax_lasx.S
|
||||
CAMAXKERNEL = camax_lasx.S
|
||||
ZAMAXKERNEL = camax_lasx.S
|
||||
|
||||
SAMINKERNEL = samin_lasx.S
|
||||
DAMINKERNEL = damin_lasx.S
|
||||
SAMINKERNEL = amin_lasx.S
|
||||
DAMINKERNEL = amin_lasx.S
|
||||
CAMINKERNEL = camin_lasx.S
|
||||
ZAMINKERNEL = camin_lasx.S
|
||||
|
||||
SMAXKERNEL = smax_lasx.S
|
||||
DMAXKERNEL = dmax_lasx.S
|
||||
SMAXKERNEL = max_lsx.S
|
||||
DMAXKERNEL = max_lsx.S
|
||||
|
||||
SMINKERNEL = smin_lasx.S
|
||||
DMINKERNEL = dmin_lasx.S
|
||||
SMINKERNEL = min_lsx.S
|
||||
DMINKERNEL = min_lsx.S
|
||||
|
||||
ISMAXKERNEL = ismax_lasx.S
|
||||
IDMAXKERNEL = idmax_lasx.S
|
||||
ISMAXKERNEL = imax_lasx.S
|
||||
IDMAXKERNEL = imax_lasx.S
|
||||
|
||||
ISMINKERNEL = ismin_lasx.S
|
||||
IDMINKERNEL = idmin_lasx.S
|
||||
ISMINKERNEL = imin_lasx.S
|
||||
IDMINKERNEL = imin_lasx.S
|
||||
|
||||
ISAMAXKERNEL = isamax_lasx.S
|
||||
IDAMAXKERNEL = idamax_lasx.S
|
||||
ISAMAXKERNEL = iamax_lasx.S
|
||||
IDAMAXKERNEL = iamax_lasx.S
|
||||
ICAMAXKERNEL = icamax_lasx.S
|
||||
IZAMAXKERNEL = icamax_lasx.S
|
||||
|
||||
ISAMINKERNEL = isamin_lasx.S
|
||||
IDAMINKERNEL = idamin_lasx.S
|
||||
ISAMINKERNEL = iamin_lasx.S
|
||||
IDAMINKERNEL = iamin_lasx.S
|
||||
ICAMINKERNEL = icamin_lasx.S
|
||||
IZAMINKERNEL = icamin_lasx.S
|
||||
|
||||
SCOPYKERNEL = scopy_lasx.S
|
||||
DCOPYKERNEL = dcopy_lasx.S
|
||||
SCOPYKERNEL = copy_lasx.S
|
||||
DCOPYKERNEL = copy_lasx.S
|
||||
CCOPYKERNEL = ccopy_lasx.S
|
||||
ZCOPYKERNEL = ccopy_lasx.S
|
||||
|
||||
SSWAPKERNEL = sswap_lasx.S
|
||||
DSWAPKERNEL = dswap_lasx.S
|
||||
SSWAPKERNEL = swap_lasx.S
|
||||
DSWAPKERNEL = swap_lasx.S
|
||||
|
||||
SAXPYKERNEL = saxpy_lasx.S
|
||||
DAXPYKERNEL = daxpy_lasx.S
|
||||
SAXPYKERNEL = axpy_lasx.S
|
||||
DAXPYKERNEL = axpy_lasx.S
|
||||
CAXPYKERNEL = caxpy_lasx.S
|
||||
ZAXPYKERNEL = caxpy_lasx.S
|
||||
|
||||
SAXPBYKERNEL = saxpby_lasx.S
|
||||
DAXPBYKERNEL = daxpby_lasx.S
|
||||
SAXPBYKERNEL = axpby_lasx.S
|
||||
DAXPBYKERNEL = axpby_lasx.S
|
||||
CAXPBYKERNEL = caxpby_lasx.S
|
||||
ZAXPBYKERNEL = caxpby_lasx.S
|
||||
|
||||
SSUMKERNEL = ssum_lasx.S
|
||||
DSUMKERNEL = dsum_lasx.S
|
||||
SSUMKERNEL = sum_lasx.S
|
||||
DSUMKERNEL = sum_lasx.S
|
||||
|
||||
SASUMKERNEL = sasum_lasx.S
|
||||
DASUMKERNEL = dasum_lasx.S
|
||||
SASUMKERNEL = asum_lasx.S
|
||||
DASUMKERNEL = asum_lasx.S
|
||||
CASUMKERNEL = casum_lasx.S
|
||||
ZASUMKERNEL = casum_lasx.S
|
||||
|
||||
SROTKERNEL = srot_lasx.S
|
||||
DROTKERNEL = drot_lasx.S
|
||||
SROTKERNEL = rot_lasx.S
|
||||
DROTKERNEL = rot_lasx.S
|
||||
CROTKERNEL = crot_lasx.S
|
||||
ZROTKERNEL = crot_lasx.S
|
||||
|
||||
SNRM2KERNEL = snrm2_lasx.S
|
||||
DNRM2KERNEL = dnrm2_lasx.S
|
||||
CNRM2KERNEL = cnrm2_lasx.S
|
||||
ZNRM2KERNEL = znrm2_lasx.S
|
||||
|
||||
CSWAPKERNEL = cswap_lasx.S
|
||||
ZSWAPKERNEL = cswap_lasx.S
|
||||
|
||||
CSUMKERNEL = csum_lasx.S
|
||||
ZSUMKERNEL = csum_lasx.S
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_16x4.S
|
||||
DGEMMINCOPY = dgemm_ncopy_16.S
|
||||
|
|
@ -81,13 +111,39 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|||
SGEMVNKERNEL = sgemv_n_8_lasx.S
|
||||
SGEMVTKERNEL = sgemv_t_8_lasx.S
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_2x2_lsx.S
|
||||
CGEMMONCOPY = cgemm_ncopy_2_lsx.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_8x4_lasx.S
|
||||
ZGEMMINCOPY = zgemm_ncopy_8_lasx.S
|
||||
ZGEMMITCOPY = zgemm_tcopy_8_lasx.S
|
||||
ZGEMMONCOPY = zgemm_ncopy_4_lasx.S
|
||||
ZGEMMOTCOPY = zgemm_tcopy_4_lasx.S
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S
|
||||
DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S
|
||||
DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S
|
||||
endif
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -0,0 +1,232 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
|
||||
#define I $r12
|
||||
#define TEMP $r13
|
||||
|
||||
#define VM0 $xr0
|
||||
#define VM1 $xr1
|
||||
#define VM2 $xr2
|
||||
#define VX0 $xr3
|
||||
#define VX1 $xr4
|
||||
#define VX2 $xr5
|
||||
#define VX3 $xr6
|
||||
|
||||
#define t1 $r14
|
||||
#define t2 $r15
|
||||
#define t3 $r16
|
||||
#define t4 $r17
|
||||
|
||||
PROLOGUE // entry of the LASX max-by-magnitude kernel; N=$r4, X=$r5, INCX=$r6 per the #defines above
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) // F_INTERFACE build: N and INCX hold addresses here, replace them with the values they point to
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
bge $r0, N, .L999 // N <= 0: go straight to the exit
|
||||
bge $r0, INCX, .L999 // INCX <= 0: go straight to the exit
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably the byte size of one element - confirm in common.h)
|
||||
slli.d INCX, INCX, BASE_SHIFT // scale INCX by the same shift so it can be added to X directly
|
||||
#ifdef DOUBLE
|
||||
xvldrepl.d VM0, X, 0 // broadcast x[0] into every lane of VM0
|
||||
#else
|
||||
xvldrepl.w VM0, X, 0 // single precision: same broadcast with 32-bit lanes
|
||||
#endif
|
||||
XVFSUB VM0, VM0, VM0 // VM0 = x[0] - x[0]: starting value of the running result (zero for finite x[0])
|
||||
bne INCX, TEMP, .L20 // stride is not one element: take the gather path at .L20
|
||||
|
||||
srai.d I, N, 4 // unit-stride path: I = N / 16 full iterations of the vector loop
|
||||
bge $r0, I, .L11 // fewer than 16 elements: only the scalar tail runs
|
||||
.align 3
|
||||
|
||||
.L10: // vector loop: consumes 128 bytes (double) or 64 bytes (single) of X per pass
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 // four 32-byte loads cover 16 doubles
|
||||
xvld VX1, X, 32
|
||||
xvld VX2, X, 64
|
||||
xvld VX3, X, 96
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 128
|
||||
XVFMAXA VM1, VX0, VX1 // pairwise combine, then fold both partial results into VM0
|
||||
XVFMAXA VM2, VX2, VX3
|
||||
XVFMAXA VM0, VM0, VM1
|
||||
XVFMAXA VM0, VM0, VM2
|
||||
#else
|
||||
xvld VX0, X, 0 // two 32-byte loads cover 16 floats
|
||||
xvld VX1, X, 32
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 64
|
||||
XVFMAXA VM1, VX0, VX1
|
||||
XVFMAXA VM0, VM0, VM1
|
||||
#endif
|
||||
blt $r0, I, .L10
|
||||
|
||||
#ifdef DOUBLE
|
||||
xvrepl128vei.d VX0, VM0, 0 // horizontal reduction inside each 128-bit half: splat element 0 and element 1, then combine
|
||||
xvrepl128vei.d VX1, VM0, 1
|
||||
XVFMAXA VM0, VX0, VX1
|
||||
#else
|
||||
xvrepl128vei.w VX0, VM0, 0 // single precision: four elements per 128-bit half, reduced as a small tree
|
||||
xvrepl128vei.w VX1, VM0, 1
|
||||
xvrepl128vei.w VX2, VM0, 2
|
||||
xvrepl128vei.w VX3, VM0, 3
|
||||
XVFMAXA VM1, VX0, VX1
|
||||
XVFMAXA VM2, VX2, VX3
|
||||
XVFMAXA VM0, VM1, VM2
|
||||
#endif
|
||||
xvpermi.q VM1, VM0, 0x1 // move the other 128-bit half of VM0 into the low half of VM1 (per xvpermi.q selector 0x1 - confirm against the ISA manual)
|
||||
XVFMAXA VM0, VM0, VM1 // low element of VM0 now holds the result over both halves
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
andi I, N, 0x0f // I = N % 16 elements left for the scalar loop
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
.L12: /* scalar tail, one element per pass; accumulates in $f0 (relies on $f0 being the low element of VM0 = $xr0) */
|
||||
LD $f1, X, 0
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, SIZE
|
||||
FMAXA $f0, $f0, $f1
|
||||
bnez I, .L12
|
||||
.align 3
|
||||
|
||||
.L13: // unit-stride exit
|
||||
FABS $f0, $f0 // the accumulated value keeps its sign, so take the absolute value before returning
|
||||
jirl $r0, $r1, 0x0 // return through $r1, result in $f0
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1: strided path, elements are gathered one by one through integer registers
|
||||
srai.d I, N, 3 // I = N / 8 full iterations
|
||||
bge $r0, I, .L23 // fewer than 8 elements: only the scalar tail at .L23 runs
|
||||
.align 3
|
||||
|
||||
.L21: // strided main loop: eight elements per pass
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 // load the raw 64-bit pattern of four consecutive strided elements
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX0, t1, 0 // pack them into lanes 0..3 of VX0
|
||||
xvinsgr2vr.d VX0, t2, 1
|
||||
xvinsgr2vr.d VX0, t3, 2
|
||||
xvinsgr2vr.d VX0, t4, 3
|
||||
ld.d t1, X, 0 // next four elements go to VX1
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE // offset 0 * SIZE is the same zero offset as the loads above
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX1, t1, 0
|
||||
xvinsgr2vr.d VX1, t2, 1
|
||||
xvinsgr2vr.d VX1, t3, 2
|
||||
xvinsgr2vr.d VX1, t4, 3
|
||||
xvfmaxa.d VM1, VX0, VX1 // lane-wise maximum by magnitude of the two groups
|
||||
xvfmaxa.d VM0, VM0, VM1 // fold into the running result
|
||||
#else
|
||||
ld.w t1, X, 0 // single precision: all eight elements fit in one 256-bit register (VM1)
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VM1, t1, 0 // lanes 0..3
|
||||
xvinsgr2vr.w VM1, t2, 1
|
||||
xvinsgr2vr.w VM1, t3, 2
|
||||
xvinsgr2vr.w VM1, t4, 3
|
||||
ld.w t1, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VM1, t1, 4 // lanes 4..7
|
||||
xvinsgr2vr.w VM1, t2, 5
|
||||
xvinsgr2vr.w VM1, t3, 6
|
||||
xvinsgr2vr.w VM1, t4, 7
|
||||
xvfmaxa.s VM0, VM0, VM1 // fold into the running result
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L21
|
||||
.align 3
|
||||
|
||||
.L22: // horizontal reduction of VM0 after the strided loop (reached by fall-through from .L21)
|
||||
#ifdef DOUBLE
|
||||
xvrepl128vei.d VX0, VM0, 0 // splat element 0 and element 1 of each 128-bit half, then combine
|
||||
xvrepl128vei.d VX1, VM0, 1
|
||||
XVFMAXA VM0, VX0, VX1
|
||||
#else
|
||||
xvrepl128vei.w VX0, VM0, 0 // single precision: four elements per 128-bit half
|
||||
xvrepl128vei.w VX1, VM0, 1
|
||||
xvrepl128vei.w VX2, VM0, 2
|
||||
xvrepl128vei.w VX3, VM0, 3
|
||||
XVFMAXA VM1, VX0, VX1
|
||||
XVFMAXA VM2, VX2, VX3
|
||||
XVFMAXA VM0, VM1, VM2
|
||||
#endif
|
||||
xvpermi.q VM1, VM0, 1 // bring the other 128-bit half of VM0 into the low half of VM1 (confirm selector semantics in the ISA manual)
|
||||
XVFMAXA VM0, VM0, VM1
|
||||
.align 3
|
||||
|
||||
.L23: //INCX!=1: handle the N % 8 elements left after the vector loop
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L24: /* strided scalar tail, one element per pass, accumulating in $f0 */
|
||||
LD $f1, X, 0
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
FMAXA $f0, $f0, $f1
|
||||
bnez I, .L24
|
||||
.align 3
|
||||
|
||||
.L999: // common exit. NOTE(review): the N <= 0 and INCX <= 0 checks at entry jump here before this routine has written $f0 - confirm callers never rely on the value returned in that case
|
||||
FABS $f0, $f0 // return the magnitude
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
|
||||
#define I $r12
|
||||
#define TEMP $r13
|
||||
|
||||
#define VM0 $vr0
|
||||
#define VM1 $vr1
|
||||
#define VM2 $vr2
|
||||
#define VX0 $vr3
|
||||
#define VX1 $vr4
|
||||
#define VX2 $vr5
|
||||
#define VX3 $vr6
|
||||
|
||||
#define t1 $r14
|
||||
#define t2 $r15
|
||||
#define t3 $r16
|
||||
#define t4 $r17
|
||||
|
||||
PROLOGUE // entry of the LSX (128-bit) max-by-magnitude kernel; N=$r4, X=$r5, INCX=$r6 per the #defines above
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) // F_INTERFACE build: N and INCX hold addresses here, replace them with the values they point to
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
bge $r0, N, .L999 // N <= 0: go straight to the exit
|
||||
bge $r0, INCX, .L999 // INCX <= 0: go straight to the exit
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably the byte size of one element - confirm in common.h)
|
||||
slli.d INCX, INCX, BASE_SHIFT // scale INCX by the same shift so it can be added to X directly
|
||||
#ifdef DOUBLE
|
||||
vldrepl.d VM0, X, 0 // broadcast x[0] into both lanes of VM0
|
||||
#else
|
||||
vldrepl.w VM0, X, 0 // single precision: broadcast into four lanes
|
||||
#endif
|
||||
VFSUB VM0, VM0, VM0 // VM0 = x[0] - x[0]: starting value of the running result (zero for finite x[0])
|
||||
bne INCX, TEMP, .L20 // stride is not one element: take the gather path at .L20
|
||||
|
||||
srai.d I, N, 3 // unit-stride path: I = N / 8 full iterations of the vector loop
|
||||
bge $r0, I, .L11 // fewer than 8 elements: only the scalar tail runs
|
||||
.align 3
|
||||
|
||||
.L10: // vector loop: consumes 64 bytes (double) or 32 bytes (single) of X per pass
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 0 // four 16-byte loads cover 8 doubles
|
||||
vld VX1, X, 16
|
||||
vld VX2, X, 32
|
||||
vld VX3, X, 48
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 64
|
||||
VFMAXA VM1, VX0, VX1 // pairwise combine, then fold both partial results into VM0
|
||||
VFMAXA VM2, VX2, VX3
|
||||
VFMAXA VM0, VM0, VM1
|
||||
VFMAXA VM0, VM0, VM2
|
||||
#else
|
||||
vld VX0, X, 0 // two 16-byte loads cover 8 floats
|
||||
vld VX1, X, 16
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 32
|
||||
VFMAXA VM1, VX0, VX1
|
||||
VFMAXA VM0, VM0, VM1
|
||||
#endif
|
||||
blt $r0, I, .L10
|
||||
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d VX0, VM0, 0 // horizontal reduction: splat element 0 and element 1, then combine
|
||||
vreplvei.d VX1, VM0, 1
|
||||
VFMAXA VM0, VX0, VX1
|
||||
#else
|
||||
vreplvei.w VX0, VM0, 0 // single precision: four elements reduced as a small tree
|
||||
vreplvei.w VX1, VM0, 1
|
||||
vreplvei.w VX2, VM0, 2
|
||||
vreplvei.w VX3, VM0, 3
|
||||
VFMAXA VM1, VX0, VX1
|
||||
VFMAXA VM2, VX2, VX3
|
||||
VFMAXA VM0, VM1, VM2
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
andi I, N, 7 // I = N % 8 elements left for the scalar loop
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
.L12: // scalar tail, one element per pass; accumulates in $f0 (relies on $f0 being the low element of VM0 = $vr0)
|
||||
LD $f1, X, 0
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, SIZE
|
||||
FMAXA $f0, $f0, $f1
|
||||
bnez I, .L12
|
||||
.align 3
|
||||
|
||||
.L13: // unit-stride exit
|
||||
FABS $f0, $f0 // the accumulated value keeps its sign, so take the absolute value before returning
|
||||
jirl $r0, $r1, 0x0 // return through $r1, result in $f0
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1: strided path, elements are gathered one by one through integer registers
|
||||
srai.d I, N, 3 // I = N / 8 full iterations
|
||||
bge $r0, I, .L23 // fewer than 8 elements: only the scalar tail at .L23 runs
|
||||
.align 3
|
||||
|
||||
.L21: // strided main loop: eight elements per pass
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE // elements 0 and 1 of this pass go to VX0
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
ld.d t3, X, 0 * SIZE // elements 2 and 3 go to VX1
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t3, 0
|
||||
vinsgr2vr.d VX1, t4, 1
|
||||
vfmaxa.d VM1, VX0, VX1 // VM1 = maximum by magnitude over elements 0..3
|
||||
ld.d t1, X, 0 * SIZE // elements 4 and 5 reuse VX0
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
ld.d t3, X, 0 * SIZE // elements 6 and 7 reuse VX1
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t3, 0
|
||||
vinsgr2vr.d VX1, t4, 1
|
||||
vfmaxa.d VM2, VX0, VX1 // VM2 = maximum by magnitude over elements 4..7
|
||||
vfmaxa.d VM1, VM1, VM2
|
||||
vfmaxa.d VM0, VM0, VM1 // fold into the running result
|
||||
#else
|
||||
ld.w t1, X, 0 // single precision: first four elements go to VX0
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, X, 0 // next four elements go to VX1
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vfmaxa.s VM1, VX0, VX1 // lane-wise maximum by magnitude of the two groups
|
||||
vfmaxa.s VM0, VM0, VM1 // fold into the running result
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L21
|
||||
.align 3
|
||||
|
||||
.L22: // horizontal reduction of VM0 after the strided loop (reached by fall-through from .L21)
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d VX0, VM0, 0 // splat element 0 and element 1, then combine
|
||||
vreplvei.d VX1, VM0, 1
|
||||
VFMAXA VM0, VX0, VX1
|
||||
#else
|
||||
vreplvei.w VX0, VM0, 0 // single precision: four elements reduced as a small tree
|
||||
vreplvei.w VX1, VM0, 1
|
||||
vreplvei.w VX2, VM0, 2
|
||||
vreplvei.w VX3, VM0, 3
|
||||
VFMAXA VM1, VX0, VX1
|
||||
VFMAXA VM2, VX2, VX3
|
||||
VFMAXA VM0, VM1, VM2
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L23: //INCX!=1: handle the N % 8 elements left after the vector loop
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L24: // strided scalar tail, one element per pass, accumulating in $f0
|
||||
LD $f1, X, 0
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
FMAXA $f0, $f0, $f1
|
||||
bnez I, .L24
|
||||
.align 3
|
||||
|
||||
.L999: // common exit. NOTE(review): the N <= 0 and INCX <= 0 checks at entry jump here before this routine has written $f0 - confirm callers never rely on the value returned in that case
|
||||
FABS $f0, $f0 // return the magnitude
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,231 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
|
||||
#define I $r12
|
||||
#define TEMP $r13
|
||||
|
||||
#define VM0 $xr0
|
||||
#define VM1 $xr1
|
||||
#define VM2 $xr2
|
||||
#define VX0 $xr3
|
||||
#define VX1 $xr4
|
||||
#define VX2 $xr5
|
||||
#define VX3 $xr6
|
||||
|
||||
#define t1 $r14
|
||||
#define t2 $r15
|
||||
#define t3 $r16
|
||||
#define t4 $r17
|
||||
|
||||
PROLOGUE // entry of the LASX min-by-magnitude kernel; N=$r4, X=$r5, INCX=$r6 per the #defines above
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) // F_INTERFACE build: N and INCX hold addresses here, replace them with the values they point to
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
bge $r0, N, .L999 // N <= 0: go straight to the exit
|
||||
bge $r0, INCX, .L999 // INCX <= 0: go straight to the exit
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably the byte size of one element - confirm in common.h)
|
||||
slli.d INCX, INCX, BASE_SHIFT // scale INCX by the same shift so it can be added to X directly
|
||||
#ifdef DOUBLE
|
||||
xvldrepl.d VM0, X, 0 // seed every lane of the running minimum with x[0]
|
||||
#else
|
||||
xvldrepl.w VM0, X, 0 // single precision: same seed with 32-bit lanes
|
||||
#endif
|
||||
bne INCX, TEMP, .L20 // stride is not one element: take the gather path at .L20
|
||||
|
||||
srai.d I, N, 4 // unit-stride path: I = N / 16 full iterations of the vector loop
|
||||
bge $r0, I, .L11 // fewer than 16 elements: only the scalar tail runs
|
||||
.align 3
|
||||
|
||||
.L10: // vector loop: consumes 128 bytes (double) or 64 bytes (single) of X per pass
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 // four 32-byte loads cover 16 doubles
|
||||
xvld VX1, X, 32
|
||||
xvld VX2, X, 64
|
||||
xvld VX3, X, 96
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 128
|
||||
XVFMINA VM1, VX0, VX1 // pairwise combine, then fold both partial results into VM0
|
||||
XVFMINA VM2, VX2, VX3
|
||||
XVFMINA VM0, VM0, VM1
|
||||
XVFMINA VM0, VM0, VM2
|
||||
#else
|
||||
xvld VX0, X, 0 // two 32-byte loads cover 16 floats
|
||||
xvld VX1, X, 32
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 64
|
||||
XVFMINA VM1, VX0, VX1
|
||||
XVFMINA VM0, VM0, VM1
|
||||
#endif
|
||||
blt $r0, I, .L10
|
||||
|
||||
#ifdef DOUBLE
|
||||
xvrepl128vei.d VX0, VM0, 0 // horizontal reduction inside each 128-bit half: splat element 0 and element 1, then combine
|
||||
xvrepl128vei.d VX1, VM0, 1
|
||||
XVFMINA VM0, VX0, VX1
|
||||
#else
|
||||
xvrepl128vei.w VX0, VM0, 0 // single precision: four elements per 128-bit half, reduced as a small tree
|
||||
xvrepl128vei.w VX1, VM0, 1
|
||||
xvrepl128vei.w VX2, VM0, 2
|
||||
xvrepl128vei.w VX3, VM0, 3
|
||||
XVFMINA VM1, VX0, VX1
|
||||
XVFMINA VM2, VX2, VX3
|
||||
XVFMINA VM0, VM1, VM2
|
||||
#endif
|
||||
xvpermi.q VM1, VM0, 0x1 // move the other 128-bit half of VM0 into the low half of VM1 (per xvpermi.q selector 0x1 - confirm against the ISA manual)
|
||||
XVFMINA VM0, VM0, VM1 // low element of VM0 now holds the result over both halves
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
andi I, N, 0x0f // I = N % 16 elements left for the scalar loop
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
.L12: /* scalar tail, one element per pass; accumulates in $f0 (relies on $f0 being the low element of VM0 = $xr0) */
|
||||
LD $f1, X, 0
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, SIZE
|
||||
FMINA $f0, $f0, $f1
|
||||
bnez I, .L12
|
||||
.align 3
|
||||
|
||||
.L13: // unit-stride exit
|
||||
FABS $f0, $f0 // the accumulated value keeps its sign, so take the absolute value before returning
|
||||
jirl $r0, $r1, 0x0 // return through $r1, result in $f0
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1: strided path of the min-by-magnitude kernel, elements are gathered through integer registers
|
||||
srai.d I, N, 3 // I = N / 8 full iterations
|
||||
bge $r0, I, .L23 // fewer than 8 elements: only the scalar tail at .L23 runs
|
||||
.align 3
|
||||
|
||||
.L21: // strided main loop: eight elements per pass, folded into VM0 with minimum-by-magnitude
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 // load the raw 64-bit pattern of four consecutive strided elements
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX0, t1, 0 // pack them into lanes 0..3 of VX0
|
||||
xvinsgr2vr.d VX0, t2, 1
|
||||
xvinsgr2vr.d VX0, t3, 2
|
||||
xvinsgr2vr.d VX0, t4, 3
|
||||
ld.d t1, X, 0 // next four elements go to VX1
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX1, t1, 0
|
||||
xvinsgr2vr.d VX1, t2, 1
|
||||
xvinsgr2vr.d VX1, t3, 2
|
||||
xvinsgr2vr.d VX1, t4, 3
|
||||
xvfmina.d VM1, VX0, VX1 // minimum by magnitude (was xvfmaxa.d, which made the strided result wrong for N >= 8)
|
||||
xvfmina.d VM0, VM0, VM1 // fold into the running minimum, consistent with the XVFMINA reduction at .L22
|
||||
#else
|
||||
ld.w t1, X, 0 // single precision: all eight elements fit in one 256-bit register (VM1)
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VM1, t1, 0 // lanes 0..3
|
||||
xvinsgr2vr.w VM1, t2, 1
|
||||
xvinsgr2vr.w VM1, t3, 2
|
||||
xvinsgr2vr.w VM1, t4, 3
|
||||
ld.w t1, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VM1, t1, 4 // lanes 4..7
|
||||
xvinsgr2vr.w VM1, t2, 5
|
||||
xvinsgr2vr.w VM1, t3, 6
|
||||
xvinsgr2vr.w VM1, t4, 7
|
||||
xvfmina.s VM0, VM0, VM1 // minimum by magnitude (was xvfmaxa.s)
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L21
|
||||
.align 3
|
||||
|
||||
.L22: // horizontal reduction of VM0 after the strided loop (reached by fall-through from .L21)
|
||||
#ifdef DOUBLE
|
||||
xvrepl128vei.d VX0, VM0, 0 // splat element 0 and element 1 of each 128-bit half, then combine
|
||||
xvrepl128vei.d VX1, VM0, 1
|
||||
XVFMINA VM0, VX0, VX1
|
||||
#else
|
||||
xvrepl128vei.w VX0, VM0, 0 // single precision: four elements per 128-bit half
|
||||
xvrepl128vei.w VX1, VM0, 1
|
||||
xvrepl128vei.w VX2, VM0, 2
|
||||
xvrepl128vei.w VX3, VM0, 3
|
||||
XVFMINA VM1, VX0, VX1
|
||||
XVFMINA VM2, VX2, VX3
|
||||
XVFMINA VM0, VM1, VM2
|
||||
#endif
|
||||
xvpermi.q VM1, VM0, 1 // bring the other 128-bit half of VM0 into the low half of VM1 (confirm selector semantics in the ISA manual)
|
||||
XVFMINA VM0, VM0, VM1
|
||||
.align 3
|
||||
|
||||
.L23: //INCX!=1: handle the N % 8 elements left after the vector loop
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L24: /* strided scalar tail, one element per pass, accumulating in $f0 */
|
||||
LD $f1, X, 0
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
FMINA $f0, $f0, $f1
|
||||
bnez I, .L24
|
||||
.align 3
|
||||
|
||||
.L999: // common exit. NOTE(review): the N <= 0 and INCX <= 0 checks at entry jump here before this routine has written $f0 - confirm callers never rely on the value returned in that case
|
||||
FABS $f0, $f0 // return the magnitude
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,230 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER

#include "common.h"

/*
 * Minimum-magnitude reduction over a strided vector using 128-bit LSX
 * vectors: the running result is combined with FMINA/VFMINA and FABS is
 * applied once at the end, so the value returned in $f0 is min(|x[i]|).
 *
 * Arguments (integer argument registers):
 *   N    - element count
 *   X    - address of the first element
 *   INCX - stride in elements (scaled to bytes by BASE_SHIFT below)
 *
 * Returns 0 in $f0 when N <= 0 or INCX <= 0.
 */

#define N      $r4
#define X      $r5
#define INCX   $r6

#define I      $r12    /* loop counter */
#define TEMP   $r13    /* 1 << BASE_SHIFT, compared with the scaled INCX */

#define VM0    $vr0    /* running result; its low lane is read as $f0 */
#define VM1    $vr1
#define VM2    $vr2
#define VX0    $vr3
#define VX1    $vr4
#define VX2    $vr5
#define VX3    $vr6

#define t1     $r14    /* integer scratch used to gather strided elements */
#define t2     $r15
#define t3     $r16
#define t4     $r17

    PROLOGUE

#ifdef F_INTERFACE
    LDINT     N,     0(N)
    LDINT     INCX,  0(INCX)
#endif

    // Define the result for the early exits below; without this the
    // degenerate cases would return FABS of whatever $f0 held on entry.
    vxor.v    VM0,   VM0,   VM0
    bge       $r0,   N,     .L999
    bge       $r0,   INCX,  .L999
    li.d      TEMP,  1
    slli.d    TEMP,  TEMP,  BASE_SHIFT
    slli.d    INCX,  INCX,  BASE_SHIFT
    // Seed every lane of the accumulator with x[0].
#ifdef DOUBLE
    vldrepl.d VM0,   X,     0
#else
    vldrepl.w VM0,   X,     0
#endif
    bne       INCX,  TEMP,  .L20

    srai.d    I,     N,     3            // number of 8-element groups
    bge       $r0,   I,     .L11
    .align 3

.L10:   // INCX==1: 8 contiguous elements per iteration
#ifdef DOUBLE
    vld       VX0,   X,     0
    vld       VX1,   X,     16
    vld       VX2,   X,     32
    vld       VX3,   X,     48
    addi.d    I,     I,     -1
    addi.d    X,     X,     64
    VFMINA    VM1,   VX0,   VX1
    VFMINA    VM2,   VX2,   VX3
    VFMINA    VM0,   VM0,   VM1
    VFMINA    VM0,   VM0,   VM2
#else
    vld       VX0,   X,     0
    vld       VX1,   X,     16
    addi.d    I,     I,     -1
    addi.d    X,     X,     32
    VFMINA    VM1,   VX0,   VX1
    VFMINA    VM0,   VM0,   VM1
#endif
    blt       $r0,   I,     .L10

    // Fold the lanes of VM0 together so the low lane ($f0) holds the result.
#ifdef DOUBLE
    vreplvei.d VX0,  VM0,   0
    vreplvei.d VX1,  VM0,   1
    VFMINA    VM0,   VX0,   VX1
#else
    vreplvei.w VX0,  VM0,   0
    vreplvei.w VX1,  VM0,   1
    vreplvei.w VX2,  VM0,   2
    vreplvei.w VX3,  VM0,   3
    VFMINA    VM1,   VX0,   VX1
    VFMINA    VM2,   VX2,   VX3
    VFMINA    VM0,   VM1,   VM2
#endif
    .align 3

.L11:   // INCX==1: remaining N & 7 elements, one at a time
    andi      I,     N,     7
    bge       $r0,   I,     .L13
    .align 3

.L12:
    LD        $f1,   X,     0
    addi.d    I,     I,     -1
    addi.d    X,     X,     SIZE
    FMINA     $f0,   $f0,   $f1
    bnez      I,     .L12
    .align 3

.L13:
    FABS      $f0,   $f0
    jirl      $r0,   $r1,   0x0
    .align 3

.L20:   // INCX!=1
    srai.d    I,     N,     3
    bge       $r0,   I,     .L23
    .align 3

.L21:   // INCX!=1: gather 8 strided elements per iteration
#ifdef DOUBLE
    ld.d      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX0, t1,    0
    vinsgr2vr.d VX0, t2,    1
    ld.d      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX1, t3,    0
    vinsgr2vr.d VX1, t4,    1
    // Must be a min-magnitude reduction like every other stage of this
    // kernel (this loop previously used vfmaxa.d).
    VFMINA    VM1,   VX0,   VX1
    ld.d      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX0, t1,    0
    vinsgr2vr.d VX0, t2,    1
    ld.d      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX1, t3,    0
    vinsgr2vr.d VX1, t4,    1
    VFMINA    VM2,   VX0,   VX1
    VFMINA    VM1,   VM1,   VM2
    VFMINA    VM0,   VM0,   VM1
#else
    ld.w      t1,    X,     0
    add.d     X,     X,     INCX
    ld.w      t2,    X,     0
    add.d     X,     X,     INCX
    ld.w      t3,    X,     0
    add.d     X,     X,     INCX
    ld.w      t4,    X,     0
    add.d     X,     X,     INCX
    vinsgr2vr.w VX0, t1,    0
    vinsgr2vr.w VX0, t2,    1
    vinsgr2vr.w VX0, t3,    2
    vinsgr2vr.w VX0, t4,    3
    ld.w      t1,    X,     0
    add.d     X,     X,     INCX
    ld.w      t2,    X,     0
    add.d     X,     X,     INCX
    ld.w      t3,    X,     0
    add.d     X,     X,     INCX
    ld.w      t4,    X,     0
    add.d     X,     X,     INCX
    vinsgr2vr.w VX1, t1,    0
    vinsgr2vr.w VX1, t2,    1
    vinsgr2vr.w VX1, t3,    2
    vinsgr2vr.w VX1, t4,    3
    // Min-magnitude, matching the rest of the kernel (previously vfmaxa.s).
    VFMINA    VM1,   VX0,   VX1
    VFMINA    VM0,   VM0,   VM1
#endif
    addi.d    I,     I,     -1
    blt       $r0,   I,     .L21
    .align 3

.L22:   // fold the lanes of VM0 into the low lane
#ifdef DOUBLE
    vreplvei.d VX0,  VM0,   0
    vreplvei.d VX1,  VM0,   1
    VFMINA    VM0,   VX0,   VX1
#else
    vreplvei.w VX0,  VM0,   0
    vreplvei.w VX1,  VM0,   1
    vreplvei.w VX2,  VM0,   2
    vreplvei.w VX3,  VM0,   3
    VFMINA    VM1,   VX0,   VX1
    VFMINA    VM2,   VX2,   VX3
    VFMINA    VM0,   VM1,   VM2
#endif
    .align 3

.L23:   // INCX!=1: remaining N & 7 elements (also the whole job when N<8)
    andi      I,     N,     7
    bge       $r0,   I,     .L999
    .align 3

.L24:
    LD        $f1,   X,     0
    addi.d    I,     I,     -1
    add.d     X,     X,     INCX
    FMINA     $f0,   $f0,   $f1
    bnez      I,     .L24
    .align 3

.L999:
    FABS      $f0,   $f0
    jirl      $r0,   $r1,   0x0

    EPILOGUE
|
||||
|
|
@ -0,0 +1,257 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
#include "common.h"

/*
 * Sum of absolute values of a strided vector using 256-bit LASX vectors.
 * The partial sums live in res1; its low lane is $f16, which the scalar
 * tail loops add to directly and which is moved to $f0 on return.
 *
 * Arguments (integer argument registers):
 *   N    - element count
 *   X    - address of the first element
 *   INCX - stride in elements (scaled to bytes by BASE_SHIFT below)
 *
 * Returns 0 in $f0 when N <= 0 or INCX <= 0.
 */

#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r17    /* loop counter */
#define TEMP   $r18    /* SIZE, compared with the scaled INCX */
#define t1     $r15    /* integer scratch used to gather strided elements */
#define t2     $r12
#define t3     $r13
#define t4     $r14
#define VX0    $xr12
#define VX1    $xr13
#define VX2    $xr14
#define VX3    $xr15
#define VT0    $xr23   /* compare masks (x < 0) */
#define VT1    $xr22
#define res1   $xr16   /* accumulator; low lane aliases $f16 */
#define res2   $xr17
#define res0   $xr18   /* all zeros */
#define neg1   $xr19   /* -1.0 in every lane */

    PROLOGUE
    xvxor.v   res1,  res1,  res1
    xvxor.v   res2,  res2,  res2
    xvxor.v   res0,  res0,  res0
    bge       $r0,   N,     .L999
    bge       $r0,   INCX,  .L999
    // Build -1.0 per lane from integer -1; |x| is formed below by
    // selecting (-1.0 * x) in the lanes where x < 0.
#ifdef DOUBLE
    li.d      t1,    -1
    xvreplgr2vr.d neg1, t1
    xvffint.d.l neg1, neg1
#else
    li.w      t1,    -1
    xvreplgr2vr.w neg1, t1
    xvffint.s.w neg1, neg1
#endif
    li.d      TEMP,  SIZE
    slli.d    INCX,  INCX,  BASE_SHIFT
    srai.d    I,     N,     3            // number of 8-element groups
    bne       INCX,  TEMP,  .L20
    bge       $r0,   I,     .L13
    .align 3

.L11:   // INCX==1: 8 contiguous elements per iteration
#ifdef DOUBLE
    xvld      VX0,   X,     0 * SIZE
    xvld      VX1,   X,     4 * SIZE
    xvfmul.d  VX2,   neg1,  VX0
    xvfmul.d  VX3,   neg1,  VX1
    xvfcmp.clt.d VT0, VX0,  res0
    xvfcmp.clt.d VT1, VX1,  res0
    xvbitsel.v VX0,  VX0,   VX2,  VT0
    xvbitsel.v VX1,  VX1,   VX3,  VT1
    xvfadd.d  res2,  VX0,   VX1
    xvfadd.d  res1,  res1,  res2
#else
    xvld      VX0,   X,     0 * SIZE
    xvfmul.s  VX2,   neg1,  VX0
    xvfcmp.clt.s VT0, VX0,  res0
    xvbitsel.v VX0,  VX0,   VX2,  VT0
    xvfadd.s  res1,  VX0,   res1
#endif
    addi.d    X,     X,     8 * SIZE
    addi.d    I,     I,     -1
    blt       $r0,   I,     .L11
    .align 3

.L12:   // horizontal sum of res1 into its low lane
#ifdef DOUBLE
    xvpickve.d VX1,  res1,  1
    xvpickve.d VX2,  res1,  2
    xvpickve.d VX3,  res1,  3
    xvfadd.d  res1,  VX1,   res1
    xvfadd.d  res1,  VX2,   res1
    xvfadd.d  res1,  VX3,   res1
#else
    // res2 is still zero on this path, so this keeps a copy of the eight
    // partial sums before res1 starts being folded.
    xvfadd.s  res2,  res1,  res2
    xvpickve.w VX1,  res1,  1
    xvpickve.w VX2,  res1,  2
    xvpickve.w VX3,  res1,  3
    xvfadd.s  res1,  VX1,   res1
    xvfadd.s  res1,  VX2,   res1
    xvfadd.s  res1,  VX3,   res1
    xvpickve.w VX0,  res2,  4
    xvpickve.w VX1,  res2,  5
    xvpickve.w VX2,  res2,  6
    xvpickve.w VX3,  res2,  7
    xvfadd.s  res1,  VX0,   res1
    xvfadd.s  res1,  VX1,   res1
    xvfadd.s  res1,  VX2,   res1
    // Lane 7 (previously VX2 was added a second time and VX3 never was).
    xvfadd.s  res1,  VX3,   res1
#endif
    .align 3

.L13:   // INCX==1: remaining N & 7 elements
    andi      I,     N,     7
    bge       $r0,   I,     .L999
    .align 3

.L14:
    LD        $f12,  X,     0 * SIZE
    FABS      $f12,  $f12
    ADD       $f16,  $f12,  $f16
    addi.d    I,     I,     -1
    addi.d    X,     X,     SIZE
    blt       $r0,   I,     .L14
    b         .L999
    .align 3

.L20:   // INCX!=1
    bge       $r0,   I,     .L23
    .align 3

.L21:   // INCX!=1: gather 8 strided elements per iteration
#ifdef DOUBLE
    ld.d      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    xvinsgr2vr.d VX0, t1,   0
    xvinsgr2vr.d VX0, t2,   1
    xvinsgr2vr.d VX0, t3,   2
    xvinsgr2vr.d VX0, t4,   3
    ld.d      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    xvinsgr2vr.d VX1, t1,   0
    xvinsgr2vr.d VX1, t2,   1
    xvinsgr2vr.d VX1, t3,   2
    xvinsgr2vr.d VX1, t4,   3
    xvfmul.d  VX2,   neg1,  VX0
    xvfmul.d  VX3,   neg1,  VX1
    xvfcmp.clt.d VT0, VX0,  res0
    xvfcmp.clt.d VT1, VX1,  res0
    xvbitsel.v VX0,  VX0,   VX2,  VT0
    xvbitsel.v VX1,  VX1,   VX3,  VT1
    xvfadd.d  res2,  VX0,   VX1
    xvfadd.d  res1,  res1,  res2
#else
    ld.w      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    xvinsgr2vr.w VX0, t1,   0
    xvinsgr2vr.w VX0, t2,   1
    xvinsgr2vr.w VX0, t3,   2
    xvinsgr2vr.w VX0, t4,   3
    ld.w      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    xvinsgr2vr.w VX0, t1,   4
    xvinsgr2vr.w VX0, t2,   5
    xvinsgr2vr.w VX0, t3,   6
    xvinsgr2vr.w VX0, t4,   7
    xvfmul.s  VX2,   neg1,  VX0
    xvfcmp.clt.s VT0, VX0,  res0
    xvbitsel.v VX0,  VX0,   VX2,  VT0
    xvfadd.s  res1,  VX0,   res1
#endif
    addi.d    I,     I,     -1
    blt       $r0,   I,     .L21
    .align 3

.L22:   // horizontal sum of res1 into its low lane
#ifdef DOUBLE
    xvpickve.d VX1,  res1,  1
    xvpickve.d VX2,  res1,  2
    xvpickve.d VX3,  res1,  3
    xvfadd.d  res1,  VX1,   res1
    xvfadd.d  res1,  VX2,   res1
    xvfadd.d  res1,  VX3,   res1
#else
    // res2 is still zero on this path: keep a copy of the partial sums.
    xvfadd.s  res2,  res1,  res2
    xvpickve.w VX1,  res1,  1
    xvpickve.w VX2,  res1,  2
    xvpickve.w VX3,  res1,  3
    xvfadd.s  res1,  VX1,   res1
    xvfadd.s  res1,  VX2,   res1
    xvfadd.s  res1,  VX3,   res1
    xvpickve.w VX0,  res2,  4
    xvpickve.w VX1,  res2,  5
    xvpickve.w VX2,  res2,  6
    xvpickve.w VX3,  res2,  7
    xvfadd.s  res1,  VX0,   res1
    xvfadd.s  res1,  VX1,   res1
    xvfadd.s  res1,  VX2,   res1
    // Lane 7 (previously VX2 was added a second time and VX3 never was).
    xvfadd.s  res1,  VX3,   res1
#endif
    .align 3

.L23:   // INCX!=1: remaining N & 7 elements
    andi      I,     N,     7
    bge       $r0,   I,     .L999
    .align 3

.L24:
    LD        $f12,  X,     0 * SIZE
    FABS      $f12,  $f12
    ADD       $f16,  $f12,  $f16
    addi.d    I,     I,     -1
    add.d     X,     X,     INCX
    blt       $r0,   I,     .L24
    .align 3

.L999:
    MOV       $f0,   $f16
    jirl      $r0,   $r1,   0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -0,0 +1,258 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
#include "common.h"

/*
 * Sum of absolute values of a strided vector using 128-bit LSX vectors.
 * Partial sums are kept in res1, whose low lane is $f16; the scalar tail
 * loops accumulate into $f16 and the result is returned in $f0.
 *
 * Arguments (integer argument registers):
 *   N    - element count
 *   X    - address of the first element
 *   INCX - stride in elements (scaled to bytes by BASE_SHIFT below)
 *
 * Returns 0 in $f0 when N <= 0 or INCX <= 0.
 */

#define N      $r4
#define X      $r5
#define INCX   $r6
#define I      $r17    /* loop counter */
#define TEMP   $r18    /* SIZE, compared with the scaled INCX */
#define t1     $r15    /* integer scratch used to gather strided elements */
#define t2     $r12
#define t3     $r13
#define t4     $r14
#define VX0    $vr12
#define VX1    $vr13
#define VX2    $vr14
#define VX3    $vr15
#define VT0    $vr23   /* compare masks (x < 0) */
#define VT1    $vr22
#define res1   $vr16   /* accumulator; low lane aliases $f16 */
#define res2   $vr17
#define res0   $vr18   /* all zeros */
#define neg1   $vr19   /* -1.0 in every lane */

    PROLOGUE
    // Clear the zero constant and both accumulators before any early exit.
    vxor.v    res0,  res0,  res0
    vxor.v    res2,  res2,  res2
    vxor.v    res1,  res1,  res1
    bge       $r0,   N,     .L999
    bge       $r0,   INCX,  .L999
    srai.d    I,     N,     3            // number of 8-element groups
    slli.d    INCX,  INCX,  BASE_SHIFT
    li.d      TEMP,  SIZE
    // neg1 = -1.0 per lane; |x| is formed by picking (-1.0 * x) where x < 0.
#ifdef DOUBLE
    li.d      t1,    -1
    vreplgr2vr.d neg1, t1
    vffint.d.l neg1, neg1
#else
    li.w      t1,    -1
    vreplgr2vr.w neg1, t1
    vffint.s.w neg1, neg1
#endif
    bne       INCX,  TEMP,  .L20
    bge       $r0,   I,     .L13
    .align 3

.L11:   // INCX==1: 8 contiguous elements per iteration
#ifdef DOUBLE
    // elements 0..3
    vld       VX0,   X,     0 * SIZE
    vld       VX1,   X,     2 * SIZE
    vfcmp.clt.d VT0, VX0,   res0
    vfcmp.clt.d VT1, VX1,   res0
    vfmul.d   VX2,   neg1,  VX0
    vfmul.d   VX3,   neg1,  VX1
    vbitsel.v VX0,   VX0,   VX2,  VT0
    vbitsel.v VX1,   VX1,   VX3,  VT1
    vfadd.d   res2,  VX0,   VX1
    vfadd.d   res1,  res1,  res2
    // elements 4..7
    vld       VX0,   X,     4 * SIZE
    vld       VX1,   X,     6 * SIZE
    vfcmp.clt.d VT0, VX0,   res0
    vfcmp.clt.d VT1, VX1,   res0
    vfmul.d   VX2,   neg1,  VX0
    vfmul.d   VX3,   neg1,  VX1
    vbitsel.v VX0,   VX0,   VX2,  VT0
    vbitsel.v VX1,   VX1,   VX3,  VT1
    vfadd.d   res2,  VX0,   VX1
    vfadd.d   res1,  res1,  res2
#else
    vld       VX0,   X,     0 * SIZE
    vld       VX1,   X,     4 * SIZE
    vfcmp.clt.s VT0, VX0,   res0
    vfcmp.clt.s VT1, VX1,   res0
    vfmul.s   VX2,   neg1,  VX0
    vfmul.s   VX3,   neg1,  VX1
    vbitsel.v VX0,   VX0,   VX2,  VT0
    vbitsel.v VX1,   VX1,   VX3,  VT1
    vfadd.s   res2,  VX0,   VX1
    vfadd.s   res1,  res1,  res2
#endif
    addi.d    I,     I,     -1
    addi.d    X,     X,     8 * SIZE
    blt       $r0,   I,     .L11
    .align 3

.L12:   // horizontal sum of res1 into its low lane
#ifdef DOUBLE
    vreplvei.d VX1,  res1,  1
    vfadd.d   res1,  VX1,   res1
#else
    vreplvei.w VX1,  res1,  1
    vreplvei.w VX2,  res1,  2
    vreplvei.w VX3,  res1,  3
    vfadd.s   res1,  VX1,   res1
    vfadd.s   res1,  VX2,   res1
    vfadd.s   res1,  VX3,   res1
#endif
    .align 3

.L13:   // INCX==1: remaining N & 7 elements
    andi      I,     N,     7
    bge       $r0,   I,     .L999
    .align 3

.L14:
    LD        $f12,  X,     0 * SIZE
    addi.d    X,     X,     SIZE
    addi.d    I,     I,     -1
    FABS      $f12,  $f12
    ADD       $f16,  $f12,  $f16
    blt       $r0,   I,     .L14
    b         .L999
    .align 3

.L20:   // INCX!=1
    bge       $r0,   I,     .L23
    .align 3

.L21:   // INCX!=1: gather 8 strided elements per iteration
#ifdef DOUBLE
    // elements 0..3
    ld.d      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX0, t1,    0
    vinsgr2vr.d VX0, t2,    1
    ld.d      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX1, t1,    0
    vinsgr2vr.d VX1, t2,    1
    vfcmp.clt.d VT0, VX0,   res0
    vfcmp.clt.d VT1, VX1,   res0
    vfmul.d   VX2,   neg1,  VX0
    vfmul.d   VX3,   neg1,  VX1
    vbitsel.v VX0,   VX0,   VX2,  VT0
    vbitsel.v VX1,   VX1,   VX3,  VT1
    vfadd.d   res2,  VX0,   VX1
    vfadd.d   res1,  res1,  res2
    // elements 4..7
    ld.d      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX0, t3,    0
    vinsgr2vr.d VX0, t4,    1
    ld.d      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.d      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.d VX1, t3,    0
    vinsgr2vr.d VX1, t4,    1
    vfcmp.clt.d VT0, VX0,   res0
    vfcmp.clt.d VT1, VX1,   res0
    vfmul.d   VX2,   neg1,  VX0
    vfmul.d   VX3,   neg1,  VX1
    vbitsel.v VX0,   VX0,   VX2,  VT0
    vbitsel.v VX1,   VX1,   VX3,  VT1
    vfadd.d   res2,  VX0,   VX1
    vfadd.d   res1,  res1,  res2
#else
    // elements 0..3
    ld.w      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.w VX0, t1,    0
    vinsgr2vr.w VX0, t2,    1
    vinsgr2vr.w VX0, t3,    2
    vinsgr2vr.w VX0, t4,    3
    // elements 4..7
    ld.w      t1,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t2,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t3,    X,     0 * SIZE
    add.d     X,     X,     INCX
    ld.w      t4,    X,     0 * SIZE
    add.d     X,     X,     INCX
    vinsgr2vr.w VX1, t1,    0
    vinsgr2vr.w VX1, t2,    1
    vinsgr2vr.w VX1, t3,    2
    vinsgr2vr.w VX1, t4,    3
    vfcmp.clt.s VT0, VX0,   res0
    vfcmp.clt.s VT1, VX1,   res0
    vfmul.s   VX2,   neg1,  VX0
    vfmul.s   VX3,   neg1,  VX1
    vbitsel.v VX0,   VX0,   VX2,  VT0
    vbitsel.v VX1,   VX1,   VX3,  VT1
    vfadd.s   res2,  VX0,   VX1
    vfadd.s   res1,  res1,  res2
#endif
    addi.d    I,     I,     -1
    blt       $r0,   I,     .L21
    .align 3

.L22:   // horizontal sum of res1 into its low lane
#ifdef DOUBLE
    vreplvei.d VX1,  res1,  1
    vfadd.d   res1,  VX1,   res1
#else
    vreplvei.w VX1,  res1,  1
    vreplvei.w VX2,  res1,  2
    vreplvei.w VX3,  res1,  3
    vfadd.s   res1,  VX1,   res1
    vfadd.s   res1,  VX2,   res1
    vfadd.s   res1,  VX3,   res1
#endif
    .align 3

.L23:   // INCX!=1: remaining N & 7 elements
    andi      I,     N,     7
    bge       $r0,   I,     .L999
    .align 3

.L24:
    LD        $f12,  X,     0 * SIZE
    add.d     X,     X,     INCX
    addi.d    I,     I,     -1
    FABS      $f12,  $f12
    ADD       $f16,  $f12,  $f16
    blt       $r0,   I,     .L24
    .align 3

.L999:
    MOV       $f0,   $f16
    jirl      $r0,   $r1,   0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -1,6 +1,33 @@
|
|||
#define ASSEMBLER
|
||||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define ALPHA $f0
|
||||
#define X $r5
|
||||
|
|
@ -30,18 +57,29 @@
|
|||
PROLOGUE
|
||||
|
||||
bge $r0, N, .L999
|
||||
li.d TEMP, 1
|
||||
movgr2fr.d a1, $r0
|
||||
ffint.d.l a1, a1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
ffint.s.l a1, a1
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
movfr2gr.d t1, ALPHA
|
||||
MTG t1, ALPHA
|
||||
MTG t2, BETA
|
||||
MTG t3, a1
|
||||
#ifdef DOUBLE
|
||||
xvreplgr2vr.d VXA, t1
|
||||
movfr2gr.d t2, BETA
|
||||
xvreplgr2vr.d VXB, t2
|
||||
movfr2gr.d t3, a1
|
||||
xvreplgr2vr.d VXZ, t3
|
||||
#else
|
||||
xvreplgr2vr.w VXA, t1
|
||||
xvreplgr2vr.w VXB, t2
|
||||
xvreplgr2vr.w VXZ, t3
|
||||
#endif
|
||||
// If incx == 0 || incy == 0, do one by one
|
||||
and TEMP, INCX, INCY
|
||||
or I, N, N
|
||||
beqz TEMP, .L998
|
||||
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
|
||||
|
|
@ -52,21 +90,22 @@
|
|||
|
||||
.L11:
|
||||
bge $r0, I, .L997
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L110
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
|
||||
b .L111 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L110:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L114 // ALPHA==0 BETA==0
|
||||
b .L113 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L111: // ALPHA!=0 BETA!=0
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvld VX3, Y, 4 * SIZE
|
||||
|
|
@ -77,6 +116,13 @@
|
|||
addi.d I, I, -1
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
xvst VX3, Y, 4 * SIZE
|
||||
#else
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
xvfmul.s VX0, VX0, VXA
|
||||
addi.d I, I, -1
|
||||
xvfmadd.s VX2, VX2, VXB, VX0
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L111
|
||||
|
|
@ -85,34 +131,46 @@
|
|||
|
||||
.L112: // ALPHA!=0 BETA==0
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvfmul.d VX0, VX0, VXA
|
||||
xvfmul.d VX1, VX1, VXA
|
||||
xvst VX0, Y, 0 * SIZE
|
||||
xvst VX1, Y, 4 * SIZE
|
||||
#else
|
||||
xvfmul.s VX0, VX0, VXA
|
||||
addi.d I, I, -1
|
||||
xvst VX0, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L112
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L113: // ALPHA==0 BETA!=0
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvld VX3, Y, 4 * SIZE
|
||||
xvfmul.d VX2, VX2, VXB
|
||||
xvfmul.d VX3, VX3, VXB
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
xvst VX3, Y, 4 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#else
|
||||
xvfmul.s VX2, VX2, VXB
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L113
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L114: // ALPHA==0 BETA==0
|
||||
xvst VXZ, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvst VXZ, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L114
|
||||
|
|
@ -122,21 +180,22 @@
|
|||
.L12: // INCX==1 and INCY!=1
|
||||
bge $r0, I, .L997
|
||||
move YY, Y
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L120
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
|
||||
b .L121 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L120:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L124 // ALPHA==0 BETA==0
|
||||
b .L123 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L121: // ALPHA!=0 BETA!=0
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t2, Y, 0 * SIZE
|
||||
|
|
@ -182,14 +241,59 @@
|
|||
xvstelm.d VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX3, YY, 0, 3
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w VX2, t1, 0
|
||||
xvinsgr2vr.w VX2, t2, 1
|
||||
xvinsgr2vr.w VX2, t3, 2
|
||||
xvinsgr2vr.w VX2, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 4
|
||||
xvinsgr2vr.w VX2, t2, 5
|
||||
xvinsgr2vr.w VX2, t3, 6
|
||||
xvinsgr2vr.w VX2, t4, 7
|
||||
add.d Y, Y, INCY
|
||||
xvfmul.s VX0, VX0, VXA
|
||||
xvfmadd.s VX2, VX2, VXB, VX0
|
||||
xvstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L121
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L122: // ALPHA!=0 BETA==0
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvfmul.d VX0, VX0, VXA
|
||||
xvfmul.d VX1, VX1, VXA
|
||||
|
|
@ -208,14 +312,33 @@
|
|||
xvstelm.d VX1, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX1, YY, 0, 3
|
||||
#else
|
||||
xvfmul.s VX0, VX0, VXA
|
||||
addi.d I, I, -1
|
||||
xvstelm.w VX0, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L122
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L123: // ALPHA==0 BETA!=0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t2, Y, 0 * SIZE
|
||||
|
|
@ -250,7 +373,6 @@
|
|||
xvstelm.d VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvfmul.d VX3, VX3, VXB
|
||||
addi.d I, I, -1
|
||||
xvstelm.d VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX3, YY, 0, 1
|
||||
|
|
@ -258,12 +380,56 @@
|
|||
xvstelm.d VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX3, YY, 0, 3
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w VX2, t1, 0
|
||||
xvinsgr2vr.w VX2, t2, 1
|
||||
xvinsgr2vr.w VX2, t3, 2
|
||||
xvinsgr2vr.w VX2, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 4
|
||||
xvinsgr2vr.w VX2, t2, 5
|
||||
xvinsgr2vr.w VX2, t3, 6
|
||||
xvinsgr2vr.w VX2, t4, 7
|
||||
add.d Y, Y, INCY
|
||||
xvfmul.s VX2, VX2, VXB
|
||||
xvstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L123
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L124: // ALPHA==0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
xvstelm.d VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VXZ, YY, 0, 1
|
||||
|
|
@ -279,6 +445,23 @@
|
|||
xvstelm.d VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VXZ, YY, 0, 3
|
||||
#else
|
||||
xvstelm.w VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L124
|
||||
|
|
@ -287,21 +470,22 @@
|
|||
|
||||
.L21:// INCX!=1 and INCY==1
|
||||
bge $r0, I, .L997
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L210
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
|
||||
b .L211 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L210:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L214 // ALPHA==0 BETA==0
|
||||
b .L213 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L211: // ALPHA!=0 BETA!=0
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -334,12 +518,43 @@
|
|||
xvfmadd.d VX3, VX3, VXB, VX1
|
||||
addi.d I, I, -1
|
||||
xvst VX3, Y, 4 * SIZE
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
add.d X, X, INCX
|
||||
xvfmul.s VX0, VXA, VX0
|
||||
xvfmadd.s VX2, VX2, VXB, VX0
|
||||
addi.d I, I, -1
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L211
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L212: // ALPHA!=0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -369,6 +584,35 @@
|
|||
xvfmul.d VX1, VX1, VXA
|
||||
addi.d I, I, -1
|
||||
xvst VX1, Y, 4 * SIZE
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
add.d X, X, INCX
|
||||
xvfmul.s VX0, VXA, VX0
|
||||
addi.d I, I, -1
|
||||
xvst VX0, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L212
|
||||
b .L997
|
||||
|
|
@ -376,20 +620,27 @@
|
|||
|
||||
.L213: // ALPHA==0 BETA!=0
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvld VX3, Y, 4 * SIZE
|
||||
xvfmul.d VX2, VX2, VXB
|
||||
xvfmul.d VX3, VX3, VXB
|
||||
addi.d I, I, -1
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
xvst VX3, Y, 4 * SIZE
|
||||
#else
|
||||
xvfmul.s VX2, VX2, VXB
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L213
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L214: // ALPHA==0 BETA==0
|
||||
xvst VXZ, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvst VXZ, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L214
|
||||
|
|
@ -399,20 +650,21 @@
|
|||
.L22:
|
||||
bge $r0, I, .L997
|
||||
move YY, Y
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L220
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
|
||||
b .L221 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L220:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L224 // ALPHA==0 BETA==0
|
||||
b .L223 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L221: // ALPHA!=0 BETA!=0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -481,12 +733,81 @@
|
|||
xvstelm.d VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX3, YY, 0, 3
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
add.d X, X, INCX
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 0
|
||||
xvinsgr2vr.w VX2, t2, 1
|
||||
xvinsgr2vr.w VX2, t3, 2
|
||||
xvinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 4
|
||||
xvinsgr2vr.w VX2, t2, 5
|
||||
xvinsgr2vr.w VX2, t3, 6
|
||||
xvinsgr2vr.w VX2, t4, 7
|
||||
add.d Y, Y, INCY
|
||||
xvfmul.s VX0, VX0, VXA
|
||||
xvfmadd.s VX2, VX2, VXB, VX0
|
||||
addi.d I, I, -1
|
||||
xvstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L221
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L222: // ALPHA!=0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -529,12 +850,56 @@
|
|||
xvstelm.d VX1, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX1, YY, 0, 3
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
add.d X, X, INCX
|
||||
xvfmul.s VX0, VX0, VXA
|
||||
addi.d I, I, -1
|
||||
xvstelm.w VX0, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX0, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L222
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L223: // ALPHA==0 BETA!=0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t2, Y, 0 * SIZE
|
||||
|
|
@ -577,12 +942,56 @@
|
|||
xvstelm.d VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX3, YY, 0, 3
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w VX2, t1, 0
|
||||
xvinsgr2vr.w VX2, t2, 1
|
||||
xvinsgr2vr.w VX2, t3, 2
|
||||
xvinsgr2vr.w VX2, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 4
|
||||
xvinsgr2vr.w VX2, t2, 5
|
||||
xvinsgr2vr.w VX2, t3, 6
|
||||
xvinsgr2vr.w VX2, t4, 7
|
||||
add.d Y, Y, INCY
|
||||
xvfmul.s VX2, VX2, VXB
|
||||
addi.d I, I, -1
|
||||
xvstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L223
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L224: // ALPHA==0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
xvstelm.d VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VXZ, YY, 0, 1
|
||||
|
|
@ -598,6 +1007,23 @@
|
|||
xvstelm.d VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VXZ, YY, 0, 3
|
||||
#else
|
||||
xvstelm.w VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VXZ, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L224
|
||||
|
|
@ -610,12 +1036,12 @@
|
|||
.align 3
|
||||
|
||||
.L998:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f13, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f13, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmul.d $f12, $f12, ALPHA
|
||||
fmadd.d $f13, $f13, BETA, $f12
|
||||
fst.d $f13, Y, 0 * SIZE
|
||||
MUL $f12, $f12, ALPHA
|
||||
MADD $f13, $f13, BETA, $f12
|
||||
ST $f13, Y, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L998
|
||||
|
|
@ -1,6 +1,33 @@
|
|||
#define ASSEMBLER
|
||||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define ALPHA $f0
|
||||
#define X $r5
|
||||
|
|
@ -30,18 +57,29 @@
|
|||
PROLOGUE
|
||||
|
||||
bge $r0, N, .L999
|
||||
li.d TEMP, 1
|
||||
movgr2fr.d a1, $r0
|
||||
ffint.d.l a1, a1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
ffint.s.l a1, a1
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
movfr2gr.d t1, ALPHA
|
||||
MTG t1, ALPHA
|
||||
MTG t2, BETA
|
||||
MTG t3, a1
|
||||
#ifdef DOUBLE
|
||||
vreplgr2vr.d VXA, t1
|
||||
movfr2gr.d t2, BETA
|
||||
vreplgr2vr.d VXB, t2
|
||||
movfr2gr.d t3, a1
|
||||
vreplgr2vr.d VXZ, t3
|
||||
#else
|
||||
vreplgr2vr.w VXA, t1
|
||||
vreplgr2vr.w VXB, t2
|
||||
vreplgr2vr.w VXZ, t3
|
||||
#endif
|
||||
// If incx == 0 || incy == 0, do one by one
|
||||
and TEMP, INCX, INCY
|
||||
or I, N, N
|
||||
beqz TEMP, .L998
|
||||
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
|
||||
|
|
@ -52,15 +90,15 @@
|
|||
|
||||
.L11:
|
||||
bge $r0, I, .L997
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L110
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
|
||||
b .L111 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L110:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L114 // ALPHA==0 BETA==0
|
||||
b .L113 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
|
@ -68,6 +106,7 @@
|
|||
.L111: // ALPHA!=0 BETA!=0
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
vld VX3, Y, 2 * SIZE
|
||||
vfmul.d VX0, VX0, VXA
|
||||
|
|
@ -86,6 +125,16 @@
|
|||
vfmadd.d VX3, VX3, VXB, VX1
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vld VX1, X, 4 * SIZE
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vfmul.s VX0, VX0, VXA
|
||||
vfmul.s VX1, VX1, VXA
|
||||
vfmadd.s VX2, VX2, VXB, VX0
|
||||
vfmadd.s VX3, VX3, VXB, VX1
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
|
|
@ -95,6 +144,7 @@
|
|||
|
||||
.L112: // ALPHA!=0 BETA==0
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
vfmul.d VX0, VX0, VXA
|
||||
vfmul.d VX1, VX1, VXA
|
||||
|
|
@ -106,6 +156,13 @@
|
|||
vfmul.d VX3, VX3, VXA
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vld VX1, X, 4 * SIZE
|
||||
vfmul.s VX0, VX0, VXA
|
||||
vfmul.s VX1, VX1, VXA
|
||||
vst VX0, Y, 0 * SIZE
|
||||
vst VX1, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
|
|
@ -113,7 +170,8 @@
|
|||
b .L997
|
||||
.align 3
|
||||
|
||||
.L113: // ALPHA==0 BETA!=0\
|
||||
.L113: // ALPHA==0 BETA!=0
|
||||
#ifdef DOUBLE
|
||||
vld VX0, Y, 0 * SIZE
|
||||
vld VX1, Y, 2 * SIZE
|
||||
vfmul.d VX0, VX0, VXB
|
||||
|
|
@ -126,6 +184,14 @@
|
|||
vfmul.d VX3, VX3, VXB
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vfmul.s VX2, VX2, VXB
|
||||
vfmul.s VX3, VX3, VXB
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L113
|
||||
|
|
@ -134,9 +200,13 @@
|
|||
|
||||
.L114: // ALPHA==0 BETA==0
|
||||
vst VXZ, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vst VXZ, Y, 2 * SIZE
|
||||
vst VXZ, Y, 4 * SIZE
|
||||
vst VXZ, Y, 6 * SIZE
|
||||
#else
|
||||
vst VXZ, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L114
|
||||
|
|
@ -146,21 +216,22 @@
|
|||
.L12: // INCX==1 and INCY!=1
|
||||
bge $r0, I, .L997
|
||||
move YY, Y
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L120
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
|
||||
b .L121 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L120:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L124 // ALPHA==0 BETA==0
|
||||
b .L123 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L121: // ALPHA!=0 BETA!=0
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t2, Y, 0 * SIZE
|
||||
|
|
@ -212,6 +283,53 @@
|
|||
vstelm.d VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VX3, YY, 0, 1
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX2, t1, 0
|
||||
vinsgr2vr.w VX2, t2, 1
|
||||
vinsgr2vr.w VX2, t3, 2
|
||||
vinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmul.s VX0, VX0, VXA
|
||||
vld VX1, X, 4 * SIZE
|
||||
vfmadd.s VX2, VX2, VXB, VX0
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w VX3, t1, 0
|
||||
vinsgr2vr.w VX3, t2, 1
|
||||
vinsgr2vr.w VX3, t3, 2
|
||||
vinsgr2vr.w VX3, t4, 3
|
||||
vstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
vfmul.s VX1, VX1, VXA
|
||||
vfmadd.s VX3, VX3, VXB, VX1
|
||||
addi.d I, I, -1
|
||||
vstelm.w VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
blt $r0, I, .L121
|
||||
|
|
@ -220,6 +338,7 @@
|
|||
|
||||
.L122: // ALPHA!=0 BETA==0
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
vfmul.d VX0, VX0, VXA
|
||||
vfmul.d VX1, VX1, VXA
|
||||
|
|
@ -242,6 +361,26 @@
|
|||
vstelm.d VX1, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VX1, YY, 0, 1
|
||||
#else
|
||||
vld VX1, X, 4 * SIZE
|
||||
vfmul.s VX0, VX0, VXA
|
||||
vfmul.s VX1, VX1, VXA
|
||||
vstelm.w VX0, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX0, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX0, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX0, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX1, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX1, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX1, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX1, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
|
|
@ -250,6 +389,7 @@
|
|||
.align 3
|
||||
|
||||
.L123: // ALPHA==0 BETA!=0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t2, Y, 0 * SIZE
|
||||
|
|
@ -294,12 +434,57 @@
|
|||
vstelm.d VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VX3, YY, 0, 1
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX2, t1, 0
|
||||
vinsgr2vr.w VX2, t2, 1
|
||||
vinsgr2vr.w VX2, t3, 2
|
||||
vinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmul.s VX2, VX2, VXB
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w VX3, t1, 0
|
||||
vinsgr2vr.w VX3, t2, 1
|
||||
vinsgr2vr.w VX3, t3, 2
|
||||
vinsgr2vr.w VX3, t4, 3
|
||||
vstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
vfmul.s VX3, VX3, VXB
|
||||
addi.d I, I, -1
|
||||
vstelm.w VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L123
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L124: // ALPHA==0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
vstelm.d VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VXZ, YY, 0, 1
|
||||
|
|
@ -315,6 +500,23 @@
|
|||
vstelm.d VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VXZ, YY, 0, 1
|
||||
#else
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L124
|
||||
|
|
@ -323,21 +525,22 @@
|
|||
|
||||
.L21:// INCX!=1 and INCY==1
|
||||
bge $r0, I, .L997
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L210
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
|
||||
b .L211 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L210:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L214 // ALPHA==0 BETA==0
|
||||
b .L213 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L211: // ALPHA!=0 BETA!=0
|
||||
vld VX2, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -378,12 +581,47 @@
|
|||
vfmadd.d VX3, VX3, VXB, VX1
|
||||
addi.d I, I, -1
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
vfmul.s VX0, VXA, VX0
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vfmadd.s VX2, VX2, VXB, VX0
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vfmul.s VX1, VX1, VXA
|
||||
vfmadd.s VX3, VX3, VXB, VX1
|
||||
addi.d I, I, -1
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L211
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L212: // ALPHA!=0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -417,6 +655,37 @@
|
|||
vfmul.d VX1, VX1, VXA
|
||||
addi.d I, I, -1
|
||||
vst VX1, Y, 6 * SIZE
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
vfmul.s VX0, VXA, VX0
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vst VX0, Y, 0 * SIZE
|
||||
vfmul.s VX1, VX1, VXA
|
||||
addi.d I, I, -1
|
||||
vst VX1, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L212
|
||||
b .L997
|
||||
|
|
@ -424,6 +693,7 @@
|
|||
|
||||
.L213: // ALPHA==0 BETA!=0
|
||||
vld VX2, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX3, Y, 2 * SIZE
|
||||
vfmul.d VX2, VX2, VXB
|
||||
vfmul.d VX3, VX3, VXB
|
||||
|
|
@ -433,19 +703,30 @@
|
|||
vld VX3, Y, 6 * SIZE
|
||||
vfmul.d VX2, VX2, VXB
|
||||
vfmul.d VX3, VX3, VXB
|
||||
addi.d I, I, -1
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vfmul.s VX2, VX2, VXB
|
||||
vfmul.s VX3, VX3, VXB
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L213
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L214: // ALPHA==0 BETA==0
|
||||
vst VXZ, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vst VXZ, Y, 2 * SIZE
|
||||
vst VXZ, Y, 4 * SIZE
|
||||
vst VXZ, Y, 6 * SIZE
|
||||
#else
|
||||
vst VXZ, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L214
|
||||
|
|
@ -455,20 +736,21 @@
|
|||
.L22:
|
||||
bge $r0, I, .L997
|
||||
move YY, Y
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L220
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
|
||||
b .L221 // ALPHA!=0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L220:
|
||||
fcmp.ceq.d $fcc0, BETA, a1
|
||||
CMPEQ $fcc0, BETA, a1
|
||||
bcnez $fcc0, .L224 // ALPHA==0 BETA==0
|
||||
b .L223 // ALPHA==0 BETA!=0
|
||||
.align 3
|
||||
|
||||
.L221: // ALPHA!=0 BETA!=0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -541,12 +823,83 @@
|
|||
vstelm.d VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VX3, YY, 0, 1
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX2, t1, 0
|
||||
vinsgr2vr.w VX2, t2, 1
|
||||
vinsgr2vr.w VX2, t3, 2
|
||||
vinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmul.s VX0, VX0, VXA
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vfmadd.s VX2, VX2, VXB, VX0
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX3, t1, 0
|
||||
vinsgr2vr.w VX3, t2, 1
|
||||
vinsgr2vr.w VX3, t3, 2
|
||||
vinsgr2vr.w VX3, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmul.s VX1, VX1, VXA
|
||||
addi.d I, I, -1
|
||||
vfmadd.s VX3, VX3, VXB, VX1
|
||||
vstelm.w VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L221
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L222: // ALPHA!=0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -591,12 +944,57 @@
|
|||
vstelm.d VX1, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VX1, YY, 0, 1
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
vfmul.s VX0, VX0, VXA
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vstelm.w VX0, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX0, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX0, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX0, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
vfmul.s VX1, VX1, VXA
|
||||
addi.d I, I, -1
|
||||
vstelm.w VX1, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX1, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX1, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX1, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L222
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L223: // ALPHA==0 BETA!=0
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t2, Y, 0 * SIZE
|
||||
|
|
@ -641,12 +1039,57 @@
|
|||
vstelm.d VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VX3, YY, 0, 1
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX2, t1, 0
|
||||
vinsgr2vr.w VX2, t2, 1
|
||||
vinsgr2vr.w VX2, t3, 2
|
||||
vinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmul.s VX2, VX2, VXB
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w VX3, t1, 0
|
||||
vinsgr2vr.w VX3, t2, 1
|
||||
vinsgr2vr.w VX3, t3, 2
|
||||
vinsgr2vr.w VX3, t4, 3
|
||||
vstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
vfmul.s VX3, VX3, VXB
|
||||
addi.d I, I, -1
|
||||
vstelm.w VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L223
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L224: // ALPHA==0 BETA==0
|
||||
#ifdef DOUBLE
|
||||
vstelm.d VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VXZ, YY, 0, 1
|
||||
|
|
@ -662,6 +1105,23 @@
|
|||
vstelm.d VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VXZ, YY, 0, 1
|
||||
#else
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L224
|
||||
|
|
@ -674,12 +1134,12 @@
|
|||
.align 3
|
||||
|
||||
.L998:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f13, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f13, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmul.d $f12, $f12, ALPHA
|
||||
fmadd.d $f13, $f13, BETA, $f12
|
||||
fst.d $f13, Y, 0 * SIZE
|
||||
MUL $f12, $f12, ALPHA
|
||||
MADD $f13, $f13, BETA, $f12
|
||||
ST $f13, Y, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L998
|
||||
|
|
@ -1,6 +1,33 @@
|
|||
#define ASSEMBLER
|
||||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define XX $r5
|
||||
#define YY $r6
|
||||
|
|
@ -35,16 +62,20 @@
|
|||
bge $r0, N, .L999
|
||||
li.d TEMP, 1
|
||||
movgr2fr.d a1, $r0
|
||||
ffint.d.l a1, a1
|
||||
FFINT a1, a1
|
||||
movgr2fr.d a2, TEMP
|
||||
ffint.d.l a2, a2
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
FFINT a2, a2
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L999
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
movfr2gr.d t1, ALPHA
|
||||
MTG t1, ALPHA
|
||||
#ifdef DOUBLE
|
||||
xvreplgr2vr.d VXA, t1
|
||||
#else
|
||||
xvreplgr2vr.w VXA, t1
|
||||
#endif
|
||||
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
|
|
@ -56,11 +87,12 @@
|
|||
|
||||
.L11:
|
||||
bge $r0, I, .L113
|
||||
fcmp.ceq.d $fcc0, ALPHA, a2
|
||||
CMPEQ $fcc0, ALPHA, a2
|
||||
bceqz $fcc0, .L112
|
||||
.align 3
|
||||
|
||||
.L111:
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
|
|
@ -70,6 +102,13 @@
|
|||
addi.d I, I, -1
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
xvst VX3, Y, 4 * SIZE
|
||||
#else
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
xvfadd.s VX2, VX0, VX2
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L111
|
||||
|
|
@ -77,6 +116,7 @@
|
|||
.align 3
|
||||
|
||||
.L112:
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
|
|
@ -86,6 +126,13 @@
|
|||
addi.d I, I, -1
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
xvst VX3, Y, 4 * SIZE
|
||||
#else
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
xvfmadd.s VX2, VX0, VXA, VX2
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L112
|
||||
|
|
@ -97,11 +144,11 @@
|
|||
.align 3
|
||||
|
||||
.L114:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
addi.d X, X, SIZE
|
||||
addi.d Y, Y, SIZE
|
||||
blt $r0, I, .L114
|
||||
|
|
@ -114,6 +161,7 @@
|
|||
.align 3
|
||||
|
||||
.L121:
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
|
@ -158,6 +206,50 @@
|
|||
xvstelm.d VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX3, YY, 0, 3
|
||||
#else
|
||||
xvld VX0, X, 0 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 0
|
||||
xvinsgr2vr.w VX2, t2, 1
|
||||
xvinsgr2vr.w VX2, t3, 2
|
||||
xvinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 4
|
||||
xvinsgr2vr.w VX2, t2, 5
|
||||
xvinsgr2vr.w VX2, t3, 6
|
||||
xvinsgr2vr.w VX2, t4, 7
|
||||
add.d Y, Y, INCY
|
||||
xvfmadd.s VX2, VX0, VXA, VX2
|
||||
addi.d I, I, -1
|
||||
xvstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
blt $r0, I, .L121
|
||||
|
|
@ -169,11 +261,11 @@
|
|||
.align 3
|
||||
|
||||
.L123:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
addi.d X, X, SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L123
|
||||
|
|
@ -185,6 +277,7 @@
|
|||
.align 3
|
||||
|
||||
.L211:
|
||||
#ifdef DOUBLE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
|
|
@ -217,6 +310,37 @@
|
|||
addi.d I, I, -1
|
||||
xvst VX3, Y, 4 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#else
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
xvfmadd.s VX2, VX0, VXA, VX2
|
||||
addi.d I, I, -1
|
||||
xvst VX2, Y, 0 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#endif
|
||||
blt $r0, I, .L211
|
||||
.align 3
|
||||
|
||||
|
|
@ -226,11 +350,11 @@
|
|||
.align 3
|
||||
|
||||
.L213:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d Y, Y, SIZE
|
||||
blt $r0, I, .L213
|
||||
|
|
@ -243,6 +367,7 @@
|
|||
.align 3
|
||||
|
||||
.L222:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -309,6 +434,73 @@
|
|||
xvstelm.d VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.d VX3, YY, 0, 3
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w VX2, t1, 0
|
||||
xvinsgr2vr.w VX2, t2, 1
|
||||
xvinsgr2vr.w VX2, t3, 2
|
||||
xvinsgr2vr.w VX2, t4, 3
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
xvinsgr2vr.w VX2, t1, 4
|
||||
xvinsgr2vr.w VX2, t2, 5
|
||||
xvinsgr2vr.w VX2, t3, 6
|
||||
xvinsgr2vr.w VX2, t4, 7
|
||||
add.d Y, Y, INCY
|
||||
xvfmadd.s VX2, VX0, VXA, VX2
|
||||
addi.d I, I, -1
|
||||
xvstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 4
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 5
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 6
|
||||
add.d YY, YY, INCY
|
||||
xvstelm.w VX2, YY, 0, 7
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
|
@ -319,15 +511,14 @@
|
|||
.align 3
|
||||
|
||||
.L224:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L224
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
|
|
@ -1,6 +1,33 @@
|
|||
#define ASSEMBLER
|
||||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define XX $r5
|
||||
#define YY $r6
|
||||
|
|
@ -35,16 +62,20 @@
|
|||
bge $r0, N, .L999
|
||||
li.d TEMP, 1
|
||||
movgr2fr.d a1, $r0
|
||||
ffint.d.l a1, a1
|
||||
FFINT a1, a1
|
||||
movgr2fr.d a2, TEMP
|
||||
ffint.d.l a2, a2
|
||||
fcmp.ceq.d $fcc0, ALPHA, a1
|
||||
FFINT a2, a2
|
||||
CMPEQ $fcc0, ALPHA, a1
|
||||
bcnez $fcc0, .L999
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
movfr2gr.d t1, ALPHA
|
||||
MTG t1, ALPHA
|
||||
#ifdef DOUBLE
|
||||
vreplgr2vr.d VXA, t1
|
||||
#else
|
||||
vreplgr2vr.w VXA, t1
|
||||
#endif
|
||||
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
|
|
@ -56,11 +87,12 @@
|
|||
|
||||
.L11:
|
||||
bge $r0, I, .L113
|
||||
fcmp.ceq.d $fcc0, ALPHA, a2
|
||||
CMPEQ $fcc0, ALPHA, a2
|
||||
bceqz $fcc0, .L112
|
||||
.align 3
|
||||
|
||||
.L111:
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX1, X, 2 * SIZE
|
||||
|
|
@ -75,16 +107,27 @@
|
|||
vld VX3, Y, 6 * SIZE
|
||||
vfadd.d VX2, VX0, VX2
|
||||
vfadd.d VX3, VX1, VX3
|
||||
addi.d I, I, -1
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX1, X, 4 * SIZE
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vfadd.s VX2, VX0, VX2
|
||||
vfadd.s VX3, VX1, VX3
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L111
|
||||
b .L113
|
||||
.align 3
|
||||
|
||||
.L112:
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX1, X, 2 * SIZE
|
||||
|
|
@ -104,6 +147,19 @@
|
|||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#else
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX1, X, 4 * SIZE
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vfmadd.s VX2, VX0, VXA, VX2
|
||||
vfmadd.s VX3, VX1, VXA, VX3
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 4 * SIZE
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
#endif
|
||||
blt $r0, I, .L112
|
||||
.align 3
|
||||
|
||||
|
|
@ -113,11 +169,11 @@
|
|||
.align 3
|
||||
|
||||
.L114:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
addi.d X, X, SIZE
|
||||
addi.d Y, Y, SIZE
|
||||
blt $r0, I, .L114
|
||||
|
|
@ -130,6 +186,7 @@
|
|||
.align 3
|
||||
|
||||
.L121:
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 0 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
|
@ -180,6 +237,54 @@
|
|||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
#else
|
||||
vld VX0, X, 0 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX2, t1, 0
|
||||
vinsgr2vr.w VX2, t2, 1
|
||||
vinsgr2vr.w VX2, t3, 2
|
||||
vinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmadd.s VX2, VX0, VXA, VX2
|
||||
vld VX1, X, 4 * SIZE
|
||||
vstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX3, t1, 0
|
||||
vinsgr2vr.w VX3, t2, 1
|
||||
vinsgr2vr.w VX3, t3, 2
|
||||
vinsgr2vr.w VX3, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmadd.s VX3, VX1, VXA, VX3
|
||||
addi.d I, I, -1
|
||||
vstelm.w VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
#endif
|
||||
blt $r0, I, .L121
|
||||
.align 3
|
||||
|
||||
|
|
@ -189,11 +294,11 @@
|
|||
.align 3
|
||||
|
||||
.L123:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
addi.d X, X, SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L123
|
||||
|
|
@ -205,6 +310,7 @@
|
|||
.align 3
|
||||
|
||||
.L211:
|
||||
#ifdef DOUBLE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
|
|
@ -242,6 +348,39 @@
|
|||
vfmadd.d VX3, VX1, VXA, VX3
|
||||
addi.d I, I, -1
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vld VX2, Y, 0 * SIZE
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
vfmadd.s VX2, VX0, VXA, VX2
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vst VX2, Y, 0 * SIZE
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
add.d X, X, INCX
|
||||
vfmadd.s VX3, VX1, VXA, VX3
|
||||
addi.d I, I, -1
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L211
|
||||
.align 3
|
||||
|
|
@ -252,11 +391,11 @@
|
|||
.align 3
|
||||
|
||||
.L213:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d Y, Y, SIZE
|
||||
blt $r0, I, .L213
|
||||
|
|
@ -269,6 +408,7 @@
|
|||
.align 3
|
||||
|
||||
.L222:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
|
|
@ -337,6 +477,74 @@
|
|||
vstelm.d VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VX3, YY, 0, 1
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX2, t1, 0
|
||||
vinsgr2vr.w VX2, t2, 1
|
||||
vinsgr2vr.w VX2, t3, 2
|
||||
vinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmadd.s VX2, VX0, VXA, VX2
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vstelm.w VX2, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX2, YY, 0, 3
|
||||
add.d YY, YY, INCY
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t4, Y, 0 * SIZE
|
||||
vinsgr2vr.w VX3, t1, 0
|
||||
vinsgr2vr.w VX3, t2, 1
|
||||
vinsgr2vr.w VX3, t3, 2
|
||||
vinsgr2vr.w VX3, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vfmadd.s VX3, VX1, VXA, VX3
|
||||
addi.d I, I, -1
|
||||
vstelm.w VX3, YY, 0, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VX3, YY, 0, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
|
@ -347,11 +555,11 @@
|
|||
.align 3
|
||||
|
||||
.L224:
|
||||
fld.d $f12, X, 0 * SIZE
|
||||
fld.d $f14, Y, 0 * SIZE
|
||||
LD $f12, X, 0 * SIZE
|
||||
LD $f14, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fmadd.d $f14, $f12, $f0, $f14
|
||||
fst.d $f14, Y, 0 * SIZE
|
||||
MADD $f14, $f12, $f0, $f14
|
||||
ST $f14, Y, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L224
|
||||
|
|
@ -0,0 +1,212 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Register assignments.
// The kernel relies on scalar FP register $fN being the low lane of vector
// register $xrN: s1 ($f22) is the low lane of VM0 ($xr22), and t2 ($f18)
// overlaps res0 ($xr18). res0 is only used on the unit-stride path and t2 only
// on the strided path, so the overlap is harmless.
#define N      $r4      // number of elements
#define X      $r5      // pointer to the current element
#define INCX   $r6      // element stride; converted to bytes below
#define I      $r12     // loop counter
#define t1     $f14
#define t2     $f18
#define t3     $f15
#define t4     $f17
#define s1     $f22     // running scalar maximum (low lane of VM0)
#define TEMP   $r16
#define a0     $f20
#define a1     $f21
#define x1     $xr9
#define x2     $xr10
#define x3     $xr11
#define x4     $xr12
#define res0   $xr18    // all-zero vector
#define VX0    $xr20
#define VX1    $xr21
#define VM0    $xr22    // running per-lane maximum
#define VM1    $xr23

// Returns in $f0 the maximum over all elements of |v0| + |v1|, where v0 and v1
// are the two values stored at offsets 0 * SIZE and 1 * SIZE of each element.
// Returns 0 when N <= 0 or INCX <= 0.
    PROLOGUE
    xvxor.v   VM0, VM0, VM0             // running max = 0 (also zeroes s1)
    xvxor.v   res0, res0, res0
    bge       $r0, N, .L999
    bge       $r0, INCX, .L999
    li.d      TEMP, 1
    slli.d    TEMP, TEMP, ZBASE_SHIFT   // presumably the byte size of one element
    slli.d    INCX, INCX, ZBASE_SHIFT   // stride in the same units as TEMP
    srai.d    I, N, 3                   // I = number of full 8-element groups
    bne       INCX, TEMP, .L20          // non-unit stride -> scalar loop
    bge       $r0, I, .L23
    .align 3

.L10: // unit stride, 8 elements per iteration
    xvld      VX0, X, 0
    xvld      VX1, X, 32
#ifdef DOUBLE
    xvpickev.d x1, VX1, VX0             // even slots: first value of each element
    xvpickod.d x2, VX1, VX0             // odd slots: second value of each element
#else
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
#endif
    XVFSUB    x3, res0, x1              // 0 - x
    XVFSUB    x4, res0, x2
    XVFMAX    x1, x1, x3                // max(x, -x) = |x|
    XVFMAX    x2, x2, x4
    XVFADD    VM1, x1, x2               // |v0| + |v1| per lane
    XVFMAX    VM0, VM0, VM1
#ifdef DOUBLE
    xvld      VX0, X, 64
    xvld      VX1, X, 96
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
    XVFSUB    x3, res0, x1
    XVFSUB    x4, res0, x2
    XVFMAX    x1, x1, x3
    XVFMAX    x2, x2, x4
    XVFADD    VM1, x1, x2
    XVFMAX    VM0, VM0, VM1
#endif
    addi.d    I, I, -1
    addi.d    X, X, 16 * SIZE
    blt       $r0, I, .L10
    .align 3

.L11: // horizontal reduction of VM0 into its low lane (= s1)
      // Every lane of VM0 holds a partial maximum, so all of them must be
      // folded: 4 lanes for double, 8 lanes for single precision.
#ifdef DOUBLE
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    XVFMAX    x1, x1, x2
    XVFMAX    x3, x3, x4
    XVFMAX    VM0, x1, x3
#else
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
    XVFMAX    x1, x1, x2
    XVFMAX    x3, x3, x4
    XVFMAX    VM1, x1, x3               // max of lanes 0..3
    xvpickve.w x1, VM0, 4
    xvpickve.w x2, VM0, 5
    xvpickve.w x3, VM0, 6
    xvpickve.w x4, VM0, 7
    XVFMAX    x1, x1, x2
    XVFMAX    x3, x3, x4
    XVFMAX    VM0, x1, x3               // max of lanes 4..7
    XVFMAX    VM0, VM0, VM1
#endif
    b         .L23
    .align 3

.L20: // INCX!=1
    bge       $r0, I, .L23
    .align 3

.L21: // strided, 8 elements per iteration; each pair is folded into s1 so the
      // maximum carries across pairs and across iterations
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    addi.d    I, I, -1
    blt       $r0, I, .L21
    .align 3

.L23: //N<8
    andi      I, N, 7                   // remaining elements after the 8-wide loops
    bge       $r0, I, .L999
    .align 3

.L24: // one element per iteration
    LD        a0, X, 0 * SIZE
    LD        a1, X, 1 * SIZE
    addi.d    I, I, -1
    FABS      a0, a0
    FABS      a1, a1
    ADD       a0, a0, a1
    add.d     X, X, INCX
    FMAX      s1, a0, s1
    blt       $r0, I, .L24
    .align 3

.L999:
    MOV       $f0, $f22                 // return s1
    jirl      $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -0,0 +1,239 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Register assignments.
// The kernel relies on scalar FP register $fN being the low lane of vector
// register $vrN: s1 ($f22) is the low lane of VM0 ($vr22), and t2 ($f18)
// overlaps res0 ($vr18). res0 is only used on the unit-stride path and t2 only
// on the strided path, so the overlap is harmless.
#define N      $r4      // number of elements
#define X      $r5      // pointer to the current element
#define INCX   $r6      // element stride; converted to bytes below
#define I      $r12     // loop counter
#define t1     $f14
#define t2     $f18
#define t3     $f15
#define t4     $f17
#define s1     $f22     // running scalar maximum (low lane of VM0)
#define TEMP   $r16
#define a0     $f20
#define a1     $f21
#define x1     $vr9
#define x2     $vr10
#define x3     $vr11
#define x4     $vr12
#define res0   $vr18    // all-zero vector
#define VX0    $vr20
#define VX1    $vr21
#define VM0    $vr22    // running per-lane maximum
#define VM1    $vr23

// Returns in $f0 the maximum over all elements of |v0| + |v1|, where v0 and v1
// are the two values stored at offsets 0 * SIZE and 1 * SIZE of each element.
// Returns 0 when N <= 0 or INCX <= 0.
    PROLOGUE
    vxor.v    VM0, VM0, VM0             // running max = 0 (also zeroes s1)
    vxor.v    res0, res0, res0
    bge       $r0, N, .L999
    bge       $r0, INCX, .L999
    li.d      TEMP, 1
    slli.d    TEMP, TEMP, ZBASE_SHIFT   // presumably the byte size of one element
    slli.d    INCX, INCX, ZBASE_SHIFT   // stride in the same units as TEMP
    srai.d    I, N, 3                   // I = number of full 8-element groups
    bne       INCX, TEMP, .L20          // non-unit stride -> scalar loop
    bge       $r0, I, .L23
    .align 3

.L10: // unit stride, 8 elements per iteration
    vld       VX0, X, 0
    vld       VX1, X, 16
#ifdef DOUBLE
    vpickev.d x1, VX1, VX0              // even slots: first value of each element
    vpickod.d x2, VX1, VX0              // odd slots: second value of each element
#else
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
#endif
    VFSUB     x3, res0, x1              // 0 - x
    VFSUB     x4, res0, x2
    VFMAX     x1, x1, x3                // max(x, -x) = |x|
    VFMAX     x2, x2, x4
    VFADD     VM1, x1, x2               // |v0| + |v1| per lane

    vld       VX0, X, 32
    vld       VX1, X, 48
#ifdef DOUBLE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
#else
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
#endif
    VFSUB     x3, res0, x1
    VFSUB     x4, res0, x2
    VFMAX     x1, x1, x3
    VFMAX     x2, x2, x4
    VFADD     x1, x1, x2
    VFMAX     VM1, x1, VM1
    VFMAX     VM0, VM0, VM1
#ifdef DOUBLE
    vld       VX0, X, 64
    vld       VX1, X, 80
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    VFSUB     x3, res0, x1
    VFSUB     x4, res0, x2
    VFMAX     x1, x1, x3
    VFMAX     x2, x2, x4
    VFADD     VM1, x1, x2

    vld       VX0, X, 96
    vld       VX1, X, 112
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    VFSUB     x3, res0, x1
    VFSUB     x4, res0, x2
    VFMAX     x1, x1, x3
    VFMAX     x2, x2, x4
    VFADD     x1, x1, x2
    VFMAX     VM1, x1, VM1
    VFMAX     VM0, VM0, VM1
#endif
    addi.d    X, X, 16 * SIZE
    addi.d    I, I, -1
    blt       $r0, I, .L10
    .align 3

.L11: // horizontal reduction of VM0; the result ends up in every lane,
      // including the low lane read as s1
#ifdef DOUBLE
    vreplvei.d x1, VM0, 0
    vreplvei.d x2, VM0, 1
    VFMAX     VM0, x1, x2
#else
    vreplvei.w x1, VM0, 0
    vreplvei.w x2, VM0, 1
    vreplvei.w x3, VM0, 2
    vreplvei.w x4, VM0, 3
    VFMAX     VM1, x1, x2
    VFMAX     VM0, x3, x4
    VFMAX     VM0, VM0, VM1
#endif
    b         .L23
    .align 3

.L20: // INCX!=1
    bge       $r0, I, .L23
    .align 3

.L21: // strided, 8 elements per iteration; each pair is folded into s1 so the
      // maximum carries across pairs and across iterations
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    LD        t1, X, 0 * SIZE
    LD        t2, X, 1 * SIZE
    add.d     X, X, INCX
    LD        t3, X, 0 * SIZE
    LD        t4, X, 1 * SIZE
    add.d     X, X, INCX
    FABS      t1, t1
    FABS      t2, t2
    FABS      t3, t3
    FABS      t4, t4
    ADD       t1, t1, t2
    ADD       t3, t3, t4
    FMAX      t1, t1, t3
    FMAX      s1, s1, t1
    addi.d    I, I, -1
    blt       $r0, I, .L21
    .align 3

.L23: //N<8
    andi      I, N, 7                   // remaining elements after the 8-wide loops
    bge       $r0, I, .L999
    .align 3

.L24: // one element per iteration
    LD        a0, X, 0 * SIZE
    LD        a1, X, 1 * SIZE
    addi.d    I, I, -1
    FABS      a0, a0
    FABS      a1, a1
    ADD       a0, a0, a1
    add.d     X, X, INCX
    FMAX      s1, a0, s1
    blt       $r0, I, .L24
    .align 3

.L999:
    MOV       $f0, $f22                 // return s1
    jirl      $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -0,0 +1,221 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define TEMP $r16
|
||||
#define t1 $f14
|
||||
#define t2 $f18
|
||||
#define t3 $f15
|
||||
#define t4 $f17
|
||||
#define s1 $f22
|
||||
#define s2 $f9
|
||||
#define s3 $f10
|
||||
#define s4 $f11
|
||||
#define a0 $f20
|
||||
#define a1 $f21
|
||||
#define x1 $xr9
|
||||
#define x2 $xr10
|
||||
#define x3 $xr11
|
||||
#define x4 $xr12
|
||||
#define VT0 $xr13
|
||||
#define VT1 $xr14
|
||||
#define res0 $xr18
|
||||
#define neg1 $xr19
|
||||
#define VX0 $xr20
|
||||
#define VX1 $xr21
|
||||
#define VM0 $xr22
|
||||
#define VM1 $xr23
|
||||
|
||||
// camin kernel, LASX (256-bit) variant.
//
// Inputs : N ($r4) element count, X ($r5) base address, INCX ($r6) stride
//          counted in complex elements.
// Output : $f0 = minimum over all elements of |x[2i]| + |x[2i+1]|,
//          or 0 when N <= 0 or INCX <= 0.
//
// Fixes relative to the previous revision:
//   * the horizontal reduction after the vector loop now folds every lane
//     of the 256-bit accumulator (previously only the low 128 bits);
//   * the strided loop now accumulates into s1..s4 instead of overwriting
//     them, and s2..s4 are seeded before the loop (s2 used to be read
//     without ever being written).

// Fold two consecutive strided complex elements into the running minimum
// held in \acc and advance X by two strides. Clobbers t1-t4.
.macro CAMIN_STRIDED_PAIR acc
    LD      t1, X, 0 * SIZE
    LD      t2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      t3, X, 0 * SIZE
    LD      t4, X, 1 * SIZE
    add.d   X, X, INCX
    FABS    t1, t1
    FABS    t2, t2
    FABS    t3, t3
    FABS    t4, t4
    ADD     t1, t1, t2
    ADD     t3, t3, t4
    FMIN    t1, t1, t3
    FMIN    \acc, \acc, t1
.endm

    PROLOGUE

    MTC     s1, $r0                     // default result: 0
    xvxor.v res0, res0, res0            // res0 = 0, used to negate lanes
    bge     $r0, N, .L999               // N <= 0
    bge     $r0, INCX, .L999            // INCX <= 0
    // Seed the running minimum with the first element.
    LD      a0, X, 0 * SIZE
    LD      a1, X, 1 * SIZE
    FABS    a0, a0
    FABS    a1, a1
    ADD     s1, a1, a0
    // s1 ($f22) shares its register number with VM0 ($xr22): broadcast
    // lane 0 so that every vector lane starts from the seed.
#ifdef DOUBLE
    xvreplve0.d VM0, VM0
#else
    xvreplve0.w VM0, VM0
#endif
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, ZBASE_SHIFT     // byte size of one complex element
    slli.d  INCX, INCX, ZBASE_SHIFT     // stride in bytes
    srai.d  I, N, 3                     // number of 8-element chunks
    bne     INCX, TEMP, .L20
    bge     $r0, I, .L23
    .align 3

.L10:   // unit stride: 8 complex elements per iteration
    xvld    VX0, X, 0
    xvld    VX1, X, 32
    // Split even-indexed and odd-indexed scalars (the two components of
    // each complex element) into separate vectors.
#ifdef DOUBLE
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
#else
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
#endif
    XVFSUB  x3, res0, x1                // -v
    XVFSUB  x4, res0, x2
    XVFMAX  x1, x1, x3                  // max(v, -v) = |v|
    XVFMAX  x2, x2, x4
    XVFADD  VM1, x1, x2
    XVFMIN  VM0, VM0, VM1
#ifdef DOUBLE
    // Two 256-bit loads hold only 4 double-complex elements; process the
    // second half of the chunk.
    xvld    VX0, X, 64
    xvld    VX1, X, 96
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
    XVFSUB  x3, res0, x1
    XVFSUB  x4, res0, x2
    XVFMAX  x1, x1, x3
    XVFMAX  x2, x2, x4
    XVFADD  VM1, x1, x2
    XVFMIN  VM0, VM0, VM1
#endif
    addi.d  I, I, -1
    addi.d  X, X, 16 * SIZE
    blt     $r0, I, .L10
    .align 3

.L11:   // horizontal minimum over ALL lanes of VM0, result in lane 0 (= s1)
#ifdef DOUBLE
    xvpickve.d x1, VM0, 0
    xvpickve.d x2, VM0, 1
    xvpickve.d x3, VM0, 2
    xvpickve.d x4, VM0, 3
    XVFMIN  VM0, x1, x2
    XVFMIN  VM1, x3, x4
    XVFMIN  VM0, VM0, VM1
#else
    // low 128 bits -> VM1
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
    XVFMIN  x1, x1, x2
    XVFMIN  x3, x3, x4
    XVFMIN  VM1, x1, x3
    // high 128 bits -> VM0, then combine
    xvpickve.w x1, VM0, 4
    xvpickve.w x2, VM0, 5
    xvpickve.w x3, VM0, 6
    xvpickve.w x4, VM0, 7
    XVFMIN  x1, x1, x2
    XVFMIN  x3, x3, x4
    XVFMIN  VM0, x1, x3
    XVFMIN  VM0, VM0, VM1
#endif
    b       .L23
    .align 3

.L20:   // INCX != 1
    bge     $r0, I, .L23
    // Seed the extra accumulators so that every FMIN below is well defined.
    MOV     s2, s1
    MOV     s3, s1
    MOV     s4, s1
    .align 3

.L21:   // strided: 8 complex elements per iteration, 4 independent minima
    CAMIN_STRIDED_PAIR s1
    CAMIN_STRIDED_PAIR s2
    CAMIN_STRIDED_PAIR s3
    CAMIN_STRIDED_PAIR s4
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3

.L22:   // merge the four accumulators into s1
    FMIN    s1, s1, s2
    FMIN    s3, s3, s4
    FMIN    s1, s1, s3
    .align 3

.L23:   // remaining N & 7 elements
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L24:
    LD      a0, X, 0 * SIZE
    LD      a1, X, 1 * SIZE
    addi.d  I, I, -1
    FABS    a0, a0
    FABS    a1, a1
    ADD     a0, a0, a1
    add.d   X, X, INCX
    FMIN    s1, a0, s1
    blt     $r0, I, .L24
    .align 3

.L999:
    MOV     $f0, $f22                   // return s1
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -0,0 +1,248 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define t1 $f14
|
||||
#define t2 $f18
|
||||
#define t3 $f15
|
||||
#define t4 $f17
|
||||
#define s1 $f22
|
||||
#define s2 $f9
|
||||
#define s3 $f10
|
||||
#define s4 $f11
|
||||
#define TEMP $r16
|
||||
#define a0 $f20
|
||||
#define a1 $f21
|
||||
#define x1 $vr9
|
||||
#define x2 $vr10
|
||||
#define x3 $vr11
|
||||
#define x4 $vr12
|
||||
#define VT0 $vr13
|
||||
#define VT1 $vr14
|
||||
#define res0 $vr18
|
||||
#define neg1 $vr19
|
||||
#define VX0 $vr20
|
||||
#define VX1 $vr21
|
||||
#define VM0 $vr22
|
||||
#define VM1 $vr23
|
||||
|
||||
// camin kernel, LSX (128-bit) variant.
//
// Inputs : N ($r4) element count, X ($r5) base address, INCX ($r6) stride
//          counted in complex elements.
// Output : $f0 = minimum over all elements of |x[2i]| + |x[2i+1]|,
//          or 0 when N <= 0 or INCX <= 0.
//
// Fix relative to the previous revision: the strided loop now accumulates
// into s1..s4 instead of overwriting them on every pass, and s2..s4 are
// seeded before the loop (s2 used to be read without ever being written).

// Load two 128-bit vectors at byte offsets \off0 and \off1 from X and store
// the per-element |x[2i]| + |x[2i+1]| in \dst. Clobbers VX0, VX1, x1-x4.
.macro CAMIN_LOAD_CABS1 dst, off0, off1
    vld     VX0, X, \off0
    vld     VX1, X, \off1
    // Split even-indexed and odd-indexed scalars into separate vectors.
#ifdef DOUBLE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
#else
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
#endif
    VFSUB   x3, res0, x1                // -v
    VFSUB   x4, res0, x2
    VFMAX   x1, x1, x3                  // max(v, -v) = |v|
    VFMAX   x2, x2, x4
    VFADD   \dst, x1, x2
.endm

// Fold two consecutive strided complex elements into the running minimum
// held in \acc and advance X by two strides. Clobbers t1-t4.
.macro CAMIN_STRIDED_PAIR acc
    LD      t1, X, 0 * SIZE
    LD      t2, X, 1 * SIZE
    add.d   X, X, INCX
    LD      t3, X, 0 * SIZE
    LD      t4, X, 1 * SIZE
    add.d   X, X, INCX
    FABS    t1, t1
    FABS    t2, t2
    FABS    t3, t3
    FABS    t4, t4
    ADD     t1, t1, t2
    ADD     t3, t3, t4
    FMIN    t1, t1, t3
    FMIN    \acc, \acc, t1
.endm

    PROLOGUE

    MTC     s1, $r0                     // default result: 0
    vxor.v  res0, res0, res0            // res0 = 0, used to negate lanes
    bge     $r0, N, .L999               // N <= 0
    bge     $r0, INCX, .L999            // INCX <= 0
    // Seed the running minimum with the first element.
    LD      a0, X, 0 * SIZE
    LD      a1, X, 1 * SIZE
    FABS    a0, a0
    FABS    a1, a1
    ADD     s1, a1, a0
    // s1 ($f22) shares its register number with VM0 ($vr22): broadcast
    // lane 0 so that every vector lane starts from the seed.
#ifdef DOUBLE
    vreplvei.d VM0, VM0, 0
#else
    vreplvei.w VM0, VM0, 0
#endif
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, ZBASE_SHIFT     // byte size of one complex element
    slli.d  INCX, INCX, ZBASE_SHIFT     // stride in bytes
    srai.d  I, N, 3                     // number of 8-element chunks
    bne     INCX, TEMP, .L20
    bge     $r0, I, .L23
    .align 3

.L10:   // unit stride: 8 complex elements per iteration
    CAMIN_LOAD_CABS1 VM1, 0, 16
    CAMIN_LOAD_CABS1 x1, 32, 48
    VFMIN   VM1, x1, VM1
    VFMIN   VM0, VM0, VM1
#ifdef DOUBLE
    // Four 128-bit loads hold only 4 double-complex elements; process the
    // second half of the chunk.
    CAMIN_LOAD_CABS1 VM1, 64, 80
    CAMIN_LOAD_CABS1 x1, 96, 112
    VFMIN   VM1, x1, VM1
    VFMIN   VM0, VM0, VM1
#endif
    addi.d  I, I, -1
    addi.d  X, X, 16 * SIZE
    blt     $r0, I, .L10
    .align 3

.L11:   // horizontal minimum over all lanes of VM0, result in lane 0 (= s1)
#ifdef DOUBLE
    vreplvei.d x1, VM0, 0
    vreplvei.d x2, VM0, 1
    VFMIN   VM0, x1, x2
#else
    vreplvei.w x1, VM0, 0
    vreplvei.w x2, VM0, 1
    vreplvei.w x3, VM0, 2
    vreplvei.w x4, VM0, 3
    VFMIN   VM1, x1, x2
    VFMIN   VM0, x3, x4
    VFMIN   VM0, VM0, VM1
#endif
    b       .L23
    .align 3

.L20:   // INCX != 1
    bge     $r0, I, .L23
    // Seed the extra accumulators so that every FMIN below is well defined.
    MOV     s2, s1
    MOV     s3, s1
    MOV     s4, s1
    .align 3

.L21:   // strided: 8 complex elements per iteration, 4 independent minima
    CAMIN_STRIDED_PAIR s1
    CAMIN_STRIDED_PAIR s2
    CAMIN_STRIDED_PAIR s3
    CAMIN_STRIDED_PAIR s4
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3

.L22:   // merge the four accumulators into s1
    FMIN    s1, s1, s2
    FMIN    s3, s3, s4
    FMIN    s1, s1, s3
    .align 3

.L23:   // remaining N & 7 elements
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L24:
    LD      a0, X, 0 * SIZE
    LD      a1, X, 1 * SIZE
    addi.d  I, I, -1
    FABS    a0, a0
    FABS    a1, a1
    ADD     a0, a0, a1
    add.d   X, X, INCX
    FMIN    s1, a0, s1
    blt     $r0, I, .L24
    .align 3

.L999:
    MOV     $f0, $f22                   // return s1
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -0,0 +1,329 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define t1 $r15
|
||||
#define t2 $r12
|
||||
#define t3 $r13
|
||||
#define t4 $r14
|
||||
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define s1 $f16
|
||||
#define VX0 $xr12
|
||||
#define VX1 $xr13
|
||||
#define VX2 $xr14
|
||||
#define VX3 $xr15
|
||||
#define res1 $xr16
|
||||
#define res2 $xr17
|
||||
#define res3 $xr18
|
||||
#define res0 $xr19
|
||||
#define neg1 $xr20
|
||||
#define VT0 $xr21
|
||||
#define VT1 $xr22
|
||||
|
||||
// casum kernel, LASX (256-bit) variant.
//
// Inputs : N ($r4) element count, X ($r5) base address, INCX ($r6) stride
//          counted in complex elements.
// Output : $f0 = sum over all elements of |x[2i]| + |x[2i+1]|,
//          or 0 when N <= 0 or INCX <= 0.
//
// Fix relative to the previous revision: the single-precision horizontal
// reduction now sums lanes 0..7 of res1 exactly once each. Previously it
// picked lanes 4..7 from res2 after adding the last partial sum into it
// (double counting) and added lane 6 twice while never adding lane 7.

// res1 += |VX0| + |VX1| lane-wise.
// |v| is formed by selecting (-1 * v) wherever v < 0.
// Clobbers VX0-VX3, VT0, VT1, res2.
.macro CASUM_ABS_ACC
#ifdef DOUBLE
    xvfmul.d     VX2, neg1, VX0
    xvfmul.d     VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v   VX0, VX0, VX2, VT0
    xvbitsel.v   VX1, VX1, VX3, VT1
    xvfadd.d     res2, VX0, VX1
    xvfadd.d     res1, res1, res2
#else
    xvfmul.s     VX2, neg1, VX0
    xvfmul.s     VX3, neg1, VX1
    xvfcmp.clt.s VT0, VX0, res0
    xvfcmp.clt.s VT1, VX1, res0
    xvbitsel.v   VX0, VX0, VX2, VT0
    xvbitsel.v   VX1, VX1, VX3, VT1
    xvfadd.s     res2, VX0, VX1
    xvfadd.s     res1, res2, res1
#endif
.endm

// Fill every lane of \vec with consecutive strided complex elements
// (2 elements when DOUBLE, 4 otherwise) and advance X past them.
// Clobbers t1-t4.
.macro CASUM_GATHER vec
#ifdef DOUBLE
    ld.d    t1, X, 0 * SIZE
    ld.d    t2, X, 1 * SIZE
    add.d   X, X, INCX
    ld.d    t3, X, 0 * SIZE
    ld.d    t4, X, 1 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.d \vec, t1, 0
    xvinsgr2vr.d \vec, t2, 1
    xvinsgr2vr.d \vec, t3, 2
    xvinsgr2vr.d \vec, t4, 3
#else
    ld.w    t1, X, 0 * SIZE
    ld.w    t2, X, 1 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    ld.w    t4, X, 1 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.w \vec, t1, 0
    xvinsgr2vr.w \vec, t2, 1
    xvinsgr2vr.w \vec, t3, 2
    xvinsgr2vr.w \vec, t4, 3
    ld.w    t1, X, 0 * SIZE
    ld.w    t2, X, 1 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    ld.w    t4, X, 1 * SIZE
    add.d   X, X, INCX
    xvinsgr2vr.w \vec, t1, 4
    xvinsgr2vr.w \vec, t2, 5
    xvinsgr2vr.w \vec, t3, 6
    xvinsgr2vr.w \vec, t4, 7
#endif
.endm

// Sum every lane of res1 into lane 0 of res1 (= s1, $f16).
// All lanes are extracted before any addition so the result does not
// depend on what the adds leave in the upper lanes of res1.
// Clobbers VX0-VX3, VT0, VT1, res2.
.macro CASUM_HSUM
#ifdef DOUBLE
    xvpickve.d  VX1, res1, 1
    xvpickve.d  VX2, res1, 2
    xvpickve.d  VX3, res1, 3
    xvfadd.d    res1, VX1, res1
    xvfadd.d    res1, VX2, res1
    xvfadd.d    res1, VX3, res1
#else
    xvpickve.w  VX0, res1, 1
    xvpickve.w  VX1, res1, 2
    xvpickve.w  VX2, res1, 3
    xvpickve.w  VX3, res1, 4
    xvpickve.w  VT0, res1, 5
    xvpickve.w  VT1, res1, 6
    xvpickve.w  res2, res1, 7
    xvfadd.s    res1, VX0, res1
    xvfadd.s    res1, VX1, res1
    xvfadd.s    res1, VX2, res1
    xvfadd.s    res1, VX3, res1
    xvfadd.s    res1, VT0, res1
    xvfadd.s    res1, VT1, res1
    xvfadd.s    res1, res2, res1
#endif
.endm

    PROLOGUE

    xvxor.v res1, res1, res1            // running sum; lane 0 doubles as s1
    xvxor.v res2, res2, res2
    xvxor.v res0, res0, res0            // res0 = 0, comparison reference
    bge     $r0, N, .L999               // N <= 0
    bge     $r0, INCX, .L999            // INCX <= 0
    // neg1 = -1.0 in every lane (integer -1 converted to floating point).
#ifdef DOUBLE
    li.d    t1, -1
    xvreplgr2vr.d neg1, t1
    xvffint.d.l neg1, neg1
#else
    li.w    t1, -1
    xvreplgr2vr.w neg1, t1
    xvffint.s.w neg1, neg1
#endif
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, ZBASE_SHIFT     // byte size of one complex element
    slli.d  INCX, INCX, ZBASE_SHIFT     // stride in bytes
    srai.d  I, N, 3                     // number of 8-element chunks
    bne     INCX, TEMP, .L20
    bge     $r0, I, .L13
    .align 3

.L11:   // unit stride: 8 complex elements (16 scalars) per iteration
#ifdef DOUBLE
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 4 * SIZE
    CASUM_ABS_ACC
    xvld    VX0, X, 8 * SIZE
    xvld    VX1, X, 12 * SIZE
    CASUM_ABS_ACC
#else
    xvld    VX0, X, 0 * SIZE
    xvld    VX1, X, 8 * SIZE
    CASUM_ABS_ACC
#endif
    addi.d  X, X, 16 * SIZE
    addi.d  I, I, -1
    blt     $r0, I, .L11
    .align 3

.L12:
    CASUM_HSUM
    .align 3

.L13:   // remaining N & 7 elements, unit stride
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L14:
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    FABS    a1, a1
    FABS    a2, a2
    addi.d  I, I, -1
    ADD     a1, a1, a2
    ADD     s1, a1, s1
    addi.d  X, X, 2 * SIZE
    blt     $r0, I, .L14
    b       .L999
    .align 3

.L20:   // INCX != 1
    bge     $r0, I, .L23
    .align 3

.L21:   // strided: 8 complex elements per iteration
    CASUM_GATHER VX0
    CASUM_GATHER VX1
    CASUM_ABS_ACC
#ifdef DOUBLE
    // A 256-bit vector holds only 2 double-complex elements; gather the
    // second half of the chunk.
    CASUM_GATHER VX0
    CASUM_GATHER VX1
    CASUM_ABS_ACC
#endif
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3

.L22:
    CASUM_HSUM
    .align 3

.L23:   // remaining N & 7 elements, strided
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L24:
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    FABS    a1, a1
    FABS    a2, a2
    addi.d  I, I, -1
    ADD     a1, a1, a2
    ADD     s1, a1, s1
    add.d   X, X, INCX
    blt     $r0, I, .L24
    .align 3

.L999:
    MOV     $f0, $f16                   // return s1
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -0,0 +1,358 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define t1 $r15
|
||||
#define t2 $r12
|
||||
#define t3 $r13
|
||||
#define t4 $r14
|
||||
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define s1 $f16
|
||||
#define VX0 $vr12
|
||||
#define VX1 $vr13
|
||||
#define VX2 $vr14
|
||||
#define VX3 $vr15
|
||||
#define res1 $vr16
|
||||
#define res2 $vr17
|
||||
#define res3 $vr18
|
||||
#define res0 $vr19
|
||||
#define neg1 $vr20
|
||||
#define VT0 $vr21
|
||||
#define VT1 $vr22
|
||||
|
||||
// casum kernel, LSX (128-bit) variant.
//
// Inputs : N ($r4) element count, X ($r5) base address, INCX ($r6) stride
//          counted in complex elements.
// Output : $f0 = sum over all elements of |x[2i]| + |x[2i+1]|,
//          or 0 when N <= 0 or INCX <= 0.

// Replace VX0 and VX1 by their lane-wise absolute values.
// |v| is formed by selecting (-1 * v) wherever v < 0.
// Clobbers VX2, VX3, VT0, VT1.
.macro CASUM_ABS
#ifdef DOUBLE
    vfmul.d     VX2, neg1, VX0
    vfmul.d     VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
#else
    vfmul.s     VX2, neg1, VX0
    vfmul.s     VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
#endif
    vbitsel.v   VX0, VX0, VX2, VT0
    vbitsel.v   VX1, VX1, VX3, VT1
.endm

// Fill every lane of \vec with consecutive strided complex elements
// (1 element when DOUBLE, 2 otherwise) and advance X past them.
// Clobbers t1, t2 (and t3, t4 in single precision).
.macro CASUM_GATHER vec
#ifdef DOUBLE
    ld.d    t1, X, 0 * SIZE
    ld.d    t2, X, 1 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.d \vec, t1, 0
    vinsgr2vr.d \vec, t2, 1
#else
    ld.w    t1, X, 0 * SIZE
    ld.w    t2, X, 1 * SIZE
    add.d   X, X, INCX
    ld.w    t3, X, 0 * SIZE
    ld.w    t4, X, 1 * SIZE
    add.d   X, X, INCX
    vinsgr2vr.w \vec, t1, 0
    vinsgr2vr.w \vec, t2, 1
    vinsgr2vr.w \vec, t3, 2
    vinsgr2vr.w \vec, t4, 3
#endif
.endm

// Sum every lane of res1 into lane 0 of res1 (= s1, $f16).
.macro CASUM_HSUM
#ifdef DOUBLE
    vreplvei.d  VX1, res1, 1
    vfadd.d     res1, VX1, res1
#else
    vreplvei.w  VX1, res1, 1
    vreplvei.w  VX2, res1, 2
    vreplvei.w  VX3, res1, 3
    vfadd.s     res1, VX1, res1
    vfadd.s     res1, VX2, res1
    vfadd.s     res1, VX3, res1
#endif
.endm

    PROLOGUE

    vxor.v  res1, res1, res1            // running sum; lane 0 doubles as s1
    vxor.v  res2, res2, res2
    vxor.v  res0, res0, res0            // res0 = 0, comparison reference
    bge     $r0, N, .L999               // N <= 0
    bge     $r0, INCX, .L999            // INCX <= 0
    // neg1 = -1.0 in every lane (integer -1 converted to floating point).
#ifdef DOUBLE
    li.d    t1, -1
    vreplgr2vr.d neg1, t1
    vffint.d.l neg1, neg1
#else
    li.w    t1, -1
    vreplgr2vr.w neg1, t1
    vffint.s.w neg1, neg1
#endif
    li.d    TEMP, 1
    slli.d  TEMP, TEMP, ZBASE_SHIFT     // byte size of one complex element
    slli.d  INCX, INCX, ZBASE_SHIFT     // stride in bytes
    srai.d  I, N, 3                     // number of 8-element chunks
    bne     INCX, TEMP, .L20
    bge     $r0, I, .L13
    .align 3

.L11:   // unit stride: 8 complex elements (16 scalars) per iteration
#ifdef DOUBLE
    vld     VX0, X, 0 * SIZE
    vld     VX1, X, 2 * SIZE
    CASUM_ABS
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2

    vld     VX0, X, 4 * SIZE
    vld     VX1, X, 6 * SIZE
    CASUM_ABS
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2

    vld     VX0, X, 8 * SIZE
    vld     VX1, X, 10 * SIZE
    CASUM_ABS
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2

    vld     VX0, X, 12 * SIZE
    vld     VX1, X, 14 * SIZE
    CASUM_ABS
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
#else
    vld     VX0, X, 0 * SIZE
    vld     VX1, X, 4 * SIZE
    CASUM_ABS
    vfadd.s res2, VX0, VX1

    vld     VX0, X, 8 * SIZE
    vld     VX1, X, 12 * SIZE
    CASUM_ABS
    vfadd.s res3, VX1, VX0
    // Combine both halves first, then fold into the running sum.
    vfadd.s res2, res3, res2
    vfadd.s res1, res1, res2
#endif
    addi.d  I, I, -1
    addi.d  X, X, 16 * SIZE
    blt     $r0, I, .L11
    .align 3

.L12:
    CASUM_HSUM
    .align 3

.L13:   // remaining N & 7 elements, unit stride
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L14:
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    FABS    a1, a1
    FABS    a2, a2
    addi.d  I, I, -1
    ADD     a1, a1, a2
    ADD     s1, a1, s1
    addi.d  X, X, 2 * SIZE
    blt     $r0, I, .L14
    b       .L999
    .align 3

.L20:   // INCX != 1
    bge     $r0, I, .L23
    .align 3

.L21:   // strided: 8 complex elements per iteration
#ifdef DOUBLE
    // Each pass handles two elements (one per vector); four passes per chunk.
    .rept 4
    CASUM_GATHER VX0
    CASUM_GATHER VX1
    CASUM_ABS
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    .endr
#else
    CASUM_GATHER VX0
    CASUM_GATHER VX1
    CASUM_ABS
    vfadd.s res2, VX0, VX1

    CASUM_GATHER VX0
    CASUM_GATHER VX1
    CASUM_ABS
    vfadd.s res3, VX0, VX1
    // Combine both halves first, then fold into the running sum.
    vfadd.s res2, res3, res2
    vfadd.s res1, res1, res2
#endif
    addi.d  I, I, -1
    blt     $r0, I, .L21
    .align 3

.L22:
    CASUM_HSUM
    .align 3

.L23:   // remaining N & 7 elements, strided
    andi    I, N, 7
    bge     $r0, I, .L999
    .align 3

.L24:
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    FABS    a1, a1
    FABS    a2, a2
    addi.d  I, I, -1
    ADD     a1, a1, a2
    ADD     s1, a1, s1
    add.d   X, X, INCX
    blt     $r0, I, .L24
    .align 3

.L999:
    MOV     $f0, $f16                   // return s1
    jirl    $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,707 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Integer arguments. XX is not referenced anywhere in this kernel; YY is
// reused as the write cursor for Y on the strided-Y paths.
#define N      $r4
#define XX     $r5
#define YY     $r6
#define X      $r7
#define INCX   $r8
#define Y      $r9
#define INCY   $r10

// Complex scalar alpha = ALPHAR + i * ALPHAI.
#define ALPHAR $f0
#define ALPHAI $f1

// Integer scratch: I is the loop counter, TEMP holds the byte size of one
// complex element, t1..t4 stage individual elements between memory and
// vector lanes. Note that t2/t3 map to $r16/$r15 (not in numeric order).
#define I      $r12
#define TEMP   $r13
#define t1     $r14
#define t2     $r16
#define t3     $r15
#define t4     $r17

// Scalar floating-point registers used by the remainder loop:
// a1/a2 = one element of X, a3/a4 = one element of Y, s1..s4 = temporaries.
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define s1     $f16
#define s2     $f17
#define s3     $f18
#define s4     $f19

// LASX vector registers. VX0..VX3 are load/compute temporaries, VXAR/VXAI
// carry alpha's real/imaginary part in every lane, x1/x2 hold the even/odd
// (real/imaginary) elements of X and x3/x4 the same for Y.
#define VX0    $xr8
#define VX1    $xr20
#define VX2    $xr21
#define VX3    $xr22
#define VXAR   $xr23
#define VXAI   $xr19
#define x1     $xr18
#define x2     $xr17
#define x3     $xr16
#define x4     $xr15
|
||||
|
||||
PROLOGUE

    // Nothing to do when n <= 0.
    bge         $r0, N, .L999
    li.d        TEMP, 1
    // a1 = 0.0, then leave early when both parts of alpha compare equal to it.
    movgr2fr.d  a1, $r0
    FFINT       a1, a1
    CMPEQ       $fcc0, ALPHAR, a1
    CMPEQ       $fcc1, ALPHAI, a1
    bceqz       $fcc0, .L10             // real part non-zero: do the work
    bcnez       $fcc1, .L999            // real and imaginary part both zero
.L10:
    // Scale the unit-stride constant and both increments by ZBASE_SHIFT so
    // they can be compared with, and added to, the X / Y pointers directly.
    slli.d      TEMP, TEMP, ZBASE_SHIFT
    slli.d      INCX, INCX, ZBASE_SHIFT
    slli.d      INCY, INCY, ZBASE_SHIFT
    // Broadcast alpha into every vector lane (through integer registers) and
    // set I to the number of full vector iterations: 4 complex elements per
    // iteration for DOUBLE, 8 otherwise.
    MTG         t1, ALPHAR
    MTG         t2, ALPHAI
#ifdef DOUBLE
    xvreplgr2vr.d VXAR, t1
    xvreplgr2vr.d VXAI, t2
    srai.d      I, N, 2
#else
    xvreplgr2vr.w VXAR, t1
    xvreplgr2vr.w VXAI, t2
    srai.d      I, N, 3
#endif
    // Pick one of four loops depending on which strides are unit strides.
    bne         INCX, TEMP, .L20
    beq         INCY, TEMP, .L11        // INCX==1 and INCY==1
    b           .L12                    // INCX==1 and INCY!=1
.L20:
    beq         INCY, TEMP, .L21        // INCX!=1 and INCY==1
    b           .L22                    // INCX!=1 and INCY!=1
|
||||
|
||||
.L11:
    // Both vectors contiguous: full-register loads and stores.
    bge         $r0, I, .L997
    .align 3

.L111:
    xvld        VX0, X, 0 * SIZE
    xvld        VX2, Y, 0 * SIZE
#ifdef DOUBLE
    xvld        VX1, X, 4 * SIZE
    xvld        VX3, Y, 4 * SIZE
    // Split the interleaved data: even elements (real) -> x1/x3,
    // odd elements (imaginary) -> x2/x4.
    xvpickev.d  x1, VX1, VX0
    xvpickod.d  x2, VX1, VX0
    xvpickev.d  x3, VX3, VX2
    xvpickod.d  x4, VX3, VX2
    xvfmul.d    VX0, VXAI, x2
    xvfmul.d    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.d   VX1, VXAR, x1, VX0
    xvfmadd.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfadd.d    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.d   VX1, VXAR, x1, VX0
    xvfmsub.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfsub.d    x4, x4, VX3
#endif
    // Re-interleave real/imaginary parts and write the result back.
    xvilvl.d    VX2, x4, x3
    xvilvh.d    VX3, x4, x3
    xvst        VX2, Y, 0 * SIZE
    xvst        VX3, Y, 4 * SIZE
    addi.d      X, X, 8 * SIZE
    addi.d      Y, Y, 8 * SIZE
#else
    xvld        VX1, X, 8 * SIZE
    xvld        VX3, Y, 8 * SIZE
    // Split the interleaved data: even elements (real) -> x1/x3,
    // odd elements (imaginary) -> x2/x4.
    xvpickev.w  x1, VX1, VX0
    xvpickod.w  x2, VX1, VX0
    xvpickev.w  x3, VX3, VX2
    xvpickod.w  x4, VX3, VX2
    xvfmul.s    VX0, VXAI, x2
    xvfmul.s    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.s   VX1, VXAR, x1, VX0
    xvfmadd.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfadd.s    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.s   VX1, VXAR, x1, VX0
    xvfmsub.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfsub.s    x4, x4, VX3
#endif
    // Re-interleave real/imaginary parts and write the result back.
    xvilvl.w    VX2, x4, x3
    xvilvh.w    VX3, x4, x3
    xvst        VX2, Y, 0 * SIZE
    xvst        VX3, Y, 8 * SIZE
    addi.d      X, X, 16 * SIZE
    addi.d      Y, Y, 16 * SIZE
#endif
    addi.d      I, I, -1
    blt         $r0, I, .L111
    b           .L997
|
||||
.align 3
|
||||
|
||||
.L12: // INCX==1 and INCY!=1: contiguous X, strided Y
    bge         $r0, I, .L997
    // Y is the read cursor for the gathers below; YY trails it as the
    // write cursor for the scattered stores.
    move        YY, Y
    .align 3

.L121:
    xvld        VX0, X, 0 * SIZE
#ifdef DOUBLE
    xvld        VX1, X, 4 * SIZE
    // Gather four Y elements. Consecutive elements go to lanes 0, 2, 1, 3,
    // the same lane sequence the stores below read back, so that they pair
    // with the lane order xvpickev/xvpickod produce for X.
    ld.d        t1, Y, 0 * SIZE
    ld.d        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.d        t3, Y, 0 * SIZE
    ld.d        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.d x3, t1, 0
    xvinsgr2vr.d x4, t2, 0
    xvinsgr2vr.d x3, t3, 2
    xvinsgr2vr.d x4, t4, 2
    ld.d        t1, Y, 0 * SIZE
    ld.d        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.d        t3, Y, 0 * SIZE
    ld.d        t4, Y, 1 * SIZE
    xvinsgr2vr.d x3, t1, 1
    xvinsgr2vr.d x4, t2, 1
    xvinsgr2vr.d x3, t3, 3
    xvinsgr2vr.d x4, t4, 3
    add.d       Y, Y, INCY
    // Even elements of X (real) -> x1, odd elements (imaginary) -> x2.
    xvpickev.d  x1, VX1, VX0
    xvpickod.d  x2, VX1, VX0
    xvfmul.d    VX0, VXAI, x2
    xvfmul.d    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.d   VX1, VXAR, x1, VX0
    xvfmadd.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfadd.d    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.d   VX1, VXAR, x1, VX0
    xvfmsub.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfsub.d    x4, x4, VX3
#endif
    // Scatter the results, reading lanes in the order they were filled.
    xvstelm.d   x3, YY, 0 * SIZE, 0
    xvstelm.d   x4, YY, 1 * SIZE, 0
    add.d       YY, YY, INCY
    xvstelm.d   x3, YY, 0 * SIZE, 2
    xvstelm.d   x4, YY, 1 * SIZE, 2
    add.d       YY, YY, INCY
    xvstelm.d   x3, YY, 0 * SIZE, 1
    xvstelm.d   x4, YY, 1 * SIZE, 1
    add.d       YY, YY, INCY
    xvstelm.d   x3, YY, 0 * SIZE, 3
    xvstelm.d   x4, YY, 1 * SIZE, 3
    add.d       YY, YY, INCY
    addi.d      X, X, 8 * SIZE
    addi.d      I, I, -1
#else
    // Gather eight Y elements. Consecutive elements go to lanes
    // 0, 1, 4, 5, 2, 3, 6, 7, the same lane sequence the stores below read
    // back, so that they pair with the lane order xvpickev/xvpickod produce
    // for X.
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.w x3, t1, 0
    xvinsgr2vr.w x4, t2, 0
    xvinsgr2vr.w x3, t3, 1
    xvinsgr2vr.w x4, t4, 1
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.w x3, t1, 4
    xvinsgr2vr.w x4, t2, 4
    xvinsgr2vr.w x3, t3, 5
    xvinsgr2vr.w x4, t4, 5
    xvld        VX1, X, 8 * SIZE
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.w x3, t1, 2
    xvinsgr2vr.w x4, t2, 2
    xvinsgr2vr.w x3, t3, 3
    xvinsgr2vr.w x4, t4, 3
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    xvinsgr2vr.w x3, t1, 6
    xvinsgr2vr.w x4, t2, 6
    xvinsgr2vr.w x3, t3, 7
    xvinsgr2vr.w x4, t4, 7
    add.d       Y, Y, INCY
    // Even elements of X (real) -> x1, odd elements (imaginary) -> x2.
    xvpickev.w  x1, VX1, VX0
    xvpickod.w  x2, VX1, VX0
    xvfmul.s    VX0, VXAI, x2
    xvfmul.s    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.s   VX1, VXAR, x1, VX0
    xvfmadd.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfadd.s    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.s   VX1, VXAR, x1, VX0
    xvfmsub.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfsub.s    x4, x4, VX3
#endif
    addi.d      I, I, -1
    // Scatter the results, reading lanes in the order they were filled.
    xvstelm.w   x3, YY, 0 * SIZE, 0
    xvstelm.w   x4, YY, 1 * SIZE, 0
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 1
    xvstelm.w   x4, YY, 1 * SIZE, 1
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 4
    xvstelm.w   x4, YY, 1 * SIZE, 4
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 5
    xvstelm.w   x4, YY, 1 * SIZE, 5
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 2
    xvstelm.w   x4, YY, 1 * SIZE, 2
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 3
    xvstelm.w   x4, YY, 1 * SIZE, 3
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 6
    xvstelm.w   x4, YY, 1 * SIZE, 6
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 7
    xvstelm.w   x4, YY, 1 * SIZE, 7
    add.d       YY, YY, INCY
    addi.d      X, X, 16 * SIZE
#endif
    blt         $r0, I, .L121
    b           .L997
|
||||
.align 3
|
||||
|
||||
.L21: // INCX!=1 and INCY==1: strided X, contiguous Y
    bge         $r0, I, .L997
    .align 3

.L211:
    xvld        VX2, Y, 0 * SIZE
#ifdef DOUBLE
    xvld        VX3, Y, 4 * SIZE
    // Gather four X elements. Consecutive elements go to lanes 0, 2, 1, 3
    // so that they pair with the lane order xvpickev/xvpickod produce for Y.
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 2
    xvinsgr2vr.d x2, t4, 2
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    xvinsgr2vr.d x1, t1, 1
    xvinsgr2vr.d x2, t2, 1
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    add.d       X, X, INCX
    // Even elements of Y (real) -> x3, odd elements (imaginary) -> x4.
    xvpickev.d  x3, VX3, VX2
    xvpickod.d  x4, VX3, VX2
    xvfmul.d    VX0, VXAI, x2
    xvfmul.d    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.d   VX1, VXAR, x1, VX0
    xvfmadd.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfadd.d    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.d   VX1, VXAR, x1, VX0
    xvfmsub.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfsub.d    x4, x4, VX3
#endif
    // Re-interleave real/imaginary parts and store contiguously.
    xvilvl.d    VX2, x4, x3
    xvilvh.d    VX3, x4, x3
    addi.d      I, I, -1
    xvst        VX2, Y, 0 * SIZE
    xvst        VX3, Y, 4 * SIZE
    addi.d      Y, Y, 8 * SIZE
#else
    // Gather eight X elements. Consecutive elements go to lanes
    // 0, 1, 4, 5, 2, 3, 6, 7 so that they pair with the lane order
    // xvpickev/xvpickod produce for Y.
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    xvld        VX3, Y, 8 * SIZE
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    // Even elements of Y (real) -> x3, odd elements (imaginary) -> x4.
    xvpickev.w  x3, VX3, VX2
    xvpickod.w  x4, VX3, VX2
    xvfmul.s    VX0, VXAI, x2
    xvfmul.s    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.s   VX1, VXAR, x1, VX0
    xvfmadd.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfadd.s    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.s   VX1, VXAR, x1, VX0
    xvfmsub.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfsub.s    x4, x4, VX3
#endif
    // Re-interleave real/imaginary parts and store contiguously.
    xvilvl.w    VX2, x4, x3
    xvilvh.w    VX3, x4, x3
    addi.d      I, I, -1
    xvst        VX2, Y, 0 * SIZE
    xvst        VX3, Y, 8 * SIZE
    addi.d      Y, Y, 16 * SIZE
#endif
    blt         $r0, I, .L211
    b           .L997
|
||||
.align 3
|
||||
|
||||
.L22:
    // Neither vector contiguous: gather X and Y element by element,
    // compute in vector registers, scatter the result through YY.
    bge         $r0, I, .L997
    move        YY, Y
    .align 3

.L222:
#ifdef DOUBLE
    // Gather four X elements into lanes 0..3 of x1 (real) / x2 (imaginary).
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 1
    xvinsgr2vr.d x2, t4, 1
    ld.d        t1, X, 0 * SIZE
    ld.d        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.d        t3, X, 0 * SIZE
    ld.d        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.d x1, t1, 2
    xvinsgr2vr.d x2, t2, 2
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    // Gather four Y elements into lanes 0..3 of x3 (real) / x4 (imaginary).
    ld.d        t1, Y, 0 * SIZE
    ld.d        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.d        t3, Y, 0 * SIZE
    ld.d        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.d x3, t1, 0
    xvinsgr2vr.d x4, t2, 0
    xvinsgr2vr.d x3, t3, 1
    xvinsgr2vr.d x4, t4, 1
    ld.d        t1, Y, 0 * SIZE
    ld.d        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.d        t3, Y, 0 * SIZE
    ld.d        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.d x3, t1, 2
    xvinsgr2vr.d x4, t2, 2
    xvinsgr2vr.d x3, t3, 3
    xvinsgr2vr.d x4, t4, 3
    xvfmul.d    VX0, VXAI, x2
    xvfmul.d    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.d   VX1, VXAR, x1, VX0
    xvfmadd.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfadd.d    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.d   VX1, VXAR, x1, VX0
    xvfmsub.d   VX3, x2, VXAR, VX2
    xvfadd.d    x3, x3, VX1
    xvfsub.d    x4, x4, VX3
#endif
    addi.d      I, I, -1
    // Scatter lanes 0..3; the last pointer bump is shared after #endif.
    xvstelm.d   x3, YY, 0 * SIZE, 0
    xvstelm.d   x4, YY, 1 * SIZE, 0
    add.d       YY, YY, INCY
    xvstelm.d   x3, YY, 0 * SIZE, 1
    xvstelm.d   x4, YY, 1 * SIZE, 1
    add.d       YY, YY, INCY
    xvstelm.d   x3, YY, 0 * SIZE, 2
    xvstelm.d   x4, YY, 1 * SIZE, 2
    add.d       YY, YY, INCY
    xvstelm.d   x3, YY, 0 * SIZE, 3
    xvstelm.d   x4, YY, 1 * SIZE, 3
#else
    // Gather X elements 0..3 into lanes 0..3 of x1 (real) / x2 (imaginary).
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    // Gather Y elements 0..3 into lanes 0..3 of x3 (real) / x4 (imaginary).
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.w x3, t1, 0
    xvinsgr2vr.w x4, t2, 0
    xvinsgr2vr.w x3, t3, 1
    xvinsgr2vr.w x4, t4, 1
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.w x3, t1, 2
    xvinsgr2vr.w x4, t2, 2
    xvinsgr2vr.w x3, t3, 3
    xvinsgr2vr.w x4, t4, 3
    // Gather X elements 4..7 into lanes 4..7.
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    ld.w        t1, X, 0 * SIZE
    ld.w        t2, X, 1 * SIZE
    add.d       X, X, INCX
    ld.w        t3, X, 0 * SIZE
    ld.w        t4, X, 1 * SIZE
    add.d       X, X, INCX
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    // Gather Y elements 4..7 into lanes 4..7.
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.w x3, t1, 4
    xvinsgr2vr.w x4, t2, 4
    xvinsgr2vr.w x3, t3, 5
    xvinsgr2vr.w x4, t4, 5
    ld.w        t1, Y, 0 * SIZE
    ld.w        t2, Y, 1 * SIZE
    add.d       Y, Y, INCY
    ld.w        t3, Y, 0 * SIZE
    ld.w        t4, Y, 1 * SIZE
    add.d       Y, Y, INCY
    xvinsgr2vr.w x3, t1, 6
    xvinsgr2vr.w x4, t2, 6
    xvinsgr2vr.w x3, t3, 7
    xvinsgr2vr.w x4, t4, 7
    xvfmul.s    VX0, VXAI, x2
    xvfmul.s    VX2, VXAI, x1
#ifndef CONJ
    // y.re += ar*x.re - ai*x.im ; y.im += ar*x.im + ai*x.re
    xvfmsub.s   VX1, VXAR, x1, VX0
    xvfmadd.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfadd.s    x4, x4, VX3
#else
    // y.re += ar*x.re + ai*x.im ; y.im -= ar*x.im - ai*x.re
    xvfmadd.s   VX1, VXAR, x1, VX0
    xvfmsub.s   VX3, x2, VXAR, VX2
    xvfadd.s    x3, x3, VX1
    xvfsub.s    x4, x4, VX3
#endif
    addi.d      I, I, -1
    // Scatter lanes 0..7; the last pointer bump is shared after #endif.
    xvstelm.w   x3, YY, 0 * SIZE, 0
    xvstelm.w   x4, YY, 1 * SIZE, 0
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 1
    xvstelm.w   x4, YY, 1 * SIZE, 1
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 2
    xvstelm.w   x4, YY, 1 * SIZE, 2
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 3
    xvstelm.w   x4, YY, 1 * SIZE, 3
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 4
    xvstelm.w   x4, YY, 1 * SIZE, 4
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 5
    xvstelm.w   x4, YY, 1 * SIZE, 5
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 6
    xvstelm.w   x4, YY, 1 * SIZE, 6
    add.d       YY, YY, INCY
    xvstelm.w   x3, YY, 0 * SIZE, 7
    xvstelm.w   x4, YY, 1 * SIZE, 7
#endif
    add.d       YY, YY, INCY
    blt         $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
.L997:
    // Elements left over after the vector loops: n mod 4 for DOUBLE,
    // n mod 8 otherwise.
#ifdef DOUBLE
    andi        I, N, 3
#else
    andi        I, N, 7
#endif
    bge         $r0, I, .L999
    .align 3

.L998:
    // One complex element per iteration: a1/a2 = x, a3/a4 = y.
    LD          a1, X, 0 * SIZE
    LD          a2, X, 1 * SIZE
    LD          a3, Y, 0 * SIZE
    LD          a4, Y, 1 * SIZE
    addi.d      I, I, -1
    // Both variants start from the same two products with ALPHAI.
    MUL         s1, ALPHAI, a2
    MUL         s2, ALPHAI, a1
#ifndef CONJ
    // Same update as the non-CONJ vector loops: s3 = new real part,
    // s4 = new imaginary part.
    MSUB        s3, ALPHAR, a1, s1
    MADD        s4, a2, ALPHAR, s2
    ADD         s3, s3, a3
    ADD         s4, s4, a4
#else
    // Same update as the CONJ vector loops; the imaginary term is
    // subtracted from y instead of added.
    MADD        s3, ALPHAR, a1, s1
    MSUB        s4, a2, ALPHAR, s2
    ADD         s3, s3, a3
    SUB         s4, a4, s4
#endif
    ST          s3, Y, 0 * SIZE
    ST          s4, Y, 1 * SIZE
    add.d       X, X, INCX
    add.d       Y, Y, INCY
    blt         $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L999:
    // Common exit: return 0 in $r4.
    // The previous code copied $r12 (I) into $r4. I is zero whenever the
    // remainder logic at .L997 has run, but it is never written on the two
    // early exits taken from the prologue (n <= 0, or alpha == 0), so those
    // paths returned whatever the caller had left in $r12. Loading the zero
    // register gives the same result on the normal paths and a defined
    // result on the early ones.
    move        $r4, $r0
    jirl        $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
||||
|
|
@ -0,0 +1,679 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define XX $r5
|
||||
#define YY $r6
|
||||
#define ALPHAR $f0
|
||||
#define ALPHAI $f1
|
||||
#define X $r7
|
||||
#define INCX $r8
|
||||
#define Y $r9
|
||||
#define INCY $r10
|
||||
|
||||
#define I $r12
|
||||
#define TEMP $r13
|
||||
#define t1 $r14
|
||||
#define t2 $r16
|
||||
#define t3 $r15
|
||||
#define t4 $r17
|
||||
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define s1 $f16
|
||||
#define s2 $f17
|
||||
#define s3 $f18
|
||||
#define s4 $f19
|
||||
#define VX0 $vr8
|
||||
#define VX1 $vr20
|
||||
#define VX2 $vr21
|
||||
#define VX3 $vr22
|
||||
#define VXAR $vr23
|
||||
#define VXAI $vr19
|
||||
#define x1 $vr18
|
||||
#define x2 $vr17
|
||||
#define x3 $vr16
|
||||
#define x4 $vr15
|
||||
|
||||
PROLOGUE
|
||||
|
||||
bge $r0, N, .L999
|
||||
li.d TEMP, 1
|
||||
movgr2fr.d a1, $r0
|
||||
FFINT a1, a1
|
||||
CMPEQ $fcc0, ALPHAR, a1
|
||||
CMPEQ $fcc1, ALPHAI, a1
|
||||
bceqz $fcc0, .L10
|
||||
bcnez $fcc1, .L999
|
||||
.L10:
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
slli.d INCY, INCY, ZBASE_SHIFT
|
||||
MTG t1, ALPHAR
|
||||
MTG t2, ALPHAI
|
||||
#ifdef DOUBLE
|
||||
vreplgr2vr.d VXAR, t1
|
||||
vreplgr2vr.d VXAI, t2
|
||||
#else
|
||||
vreplgr2vr.w VXAR, t1
|
||||
vreplgr2vr.w VXAI, t2
|
||||
#endif
|
||||
srai.d I, N, 2
|
||||
bne INCX, TEMP, .L20
|
||||
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
|
||||
b .L11 // INCX==1 and INCY==1
|
||||
.L20:
|
||||
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
|
||||
b .L21 // INCX!=1 and INCY==1
|
||||
|
||||
.L11:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L111:
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX1, X, 2 * SIZE
|
||||
vld VX3, Y, 2 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vpickev.d x3, VX3, VX2
|
||||
vpickod.d x4, VX3, VX2
|
||||
#else
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX1, X, 4 * SIZE
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vpickev.w x3, VX3, VX2
|
||||
vpickod.w x4, VX3, VX2
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmsub.s VX1, VXAR, x1, VX0
|
||||
vfmadd.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfadd.s x4, x4, VX3
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmadd.s VX1, VXAR, x1, VX0
|
||||
vfmsub.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfsub.s x4, x4, VX3
|
||||
#endif
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
vilvl.d VX2, x4 ,x3
|
||||
vilvh.d VX3, x4, x3
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 2 * SIZE
|
||||
vld VX0, X, 4 * SIZE
|
||||
vld VX2, Y, 4 * SIZE
|
||||
vld VX1, X, 6 * SIZE
|
||||
vld VX3, Y, 6 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vpickev.d x3, VX3, VX2
|
||||
vpickod.d x4, VX3, VX2
|
||||
#if !defined(CONJ)
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#endif
|
||||
vilvl.d VX2, x4 ,x3
|
||||
vilvh.d VX3, x4, x3
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vilvl.w VX2, x4 ,x3
|
||||
vilvh.w VX3, x4, x3
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L111
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L12: // INCX==1 and INCY!=1
|
||||
bge $r0, I, .L997
|
||||
move YY, Y
|
||||
.align 3
|
||||
|
||||
.L121:
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 0 * SIZE
|
||||
vld VX1, X, 2 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
add.d Y, Y, INCY
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
#else
|
||||
vld VX0, X, 0 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 0
|
||||
vinsgr2vr.w x4, t2, 0
|
||||
vinsgr2vr.w x3, t3, 1
|
||||
vinsgr2vr.w x4, t4, 1
|
||||
vld VX1, X, 4 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
vinsgr2vr.w x3, t1, 2
|
||||
vinsgr2vr.w x4, t2, 2
|
||||
vinsgr2vr.w x3, t3, 3
|
||||
vinsgr2vr.w x4, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmsub.s VX1, VXAR, x1, VX0
|
||||
vfmadd.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfadd.s x4, x4, VX3
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmadd.s VX1, VXAR, x1, VX0
|
||||
vfmsub.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfsub.s x4, x4, VX3
|
||||
#endif
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
vstelm.d x3, YY, 0 * SIZE, 0
|
||||
vstelm.d x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d x3, YY, 0 * SIZE, 1
|
||||
vstelm.d x4, YY, 1 * SIZE, 1
|
||||
add.d YY, YY, INCY
|
||||
|
||||
vld VX0, X, 4 * SIZE
|
||||
vld VX1, X, 6 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
add.d Y, Y, INCY
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
#if !defined(CONJ)
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
vstelm.d x3, YY, 0 * SIZE, 0
|
||||
vstelm.d x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d x3, YY, 0 * SIZE, 1
|
||||
vstelm.d x4, YY, 1 * SIZE, 1
|
||||
#else
|
||||
addi.d I, I, -1
|
||||
vstelm.w x3, YY, 0 * SIZE, 0
|
||||
vstelm.w x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 1
|
||||
vstelm.w x4, YY, 1 * SIZE, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 2
|
||||
vstelm.w x4, YY, 1 * SIZE, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 3
|
||||
vstelm.w x4, YY, 1 * SIZE, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
addi.d X, X, 8 * SIZE
|
||||
blt $r0, I, .L121
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L21:// INCX!=1 and INCY==1
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L211:
|
||||
#ifdef DOUBLE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX3, Y, 2 * SIZE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
add.d X, X, INCX
|
||||
vpickev.d x3, VX3, VX2
|
||||
vpickod.d x4, VX3, VX2
|
||||
#else
|
||||
vld VX2, Y, 0 * SIZE
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 0
|
||||
vinsgr2vr.w x2, t2, 0
|
||||
vinsgr2vr.w x1, t3, 1
|
||||
vinsgr2vr.w x2, t4, 1
|
||||
vld VX3, Y, 4 * SIZE
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
vinsgr2vr.w x1, t1, 2
|
||||
vinsgr2vr.w x2, t2, 2
|
||||
vinsgr2vr.w x1, t3, 3
|
||||
vinsgr2vr.w x2, t4, 3
|
||||
add.d X, X, INCX
|
||||
vpickev.w x3, VX3, VX2
|
||||
vpickod.w x4, VX3, VX2
|
||||
#endif
|
||||
#if !defined(CONJ)
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmsub.s VX1, VXAR, x1, VX0
|
||||
vfmadd.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfadd.s x4, x4, VX3
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmadd.s VX1, VXAR, x1, VX0
|
||||
vfmsub.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfsub.s x4, x4, VX3
|
||||
#endif
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
vilvl.d VX2, x4 ,x3
|
||||
vilvh.d VX3, x4, x3
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 2 * SIZE
|
||||
|
||||
vld VX2, Y, 4 * SIZE
|
||||
vld VX3, Y, 6 * SIZE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
add.d X, X, INCX
|
||||
vpickev.d x3, VX3, VX2
|
||||
vpickod.d x4, VX3, VX2
|
||||
#if !defined(CONJ)
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#endif
|
||||
vilvl.d VX2, x4 ,x3
|
||||
vilvh.d VX3, x4, x3
|
||||
addi.d I, I, -1
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
#else
|
||||
vilvl.w VX2, x4 ,x3
|
||||
vilvh.w VX3, x4, x3
|
||||
addi.d I, I, -1
|
||||
vst VX2, Y, 0 * SIZE
|
||||
vst VX3, Y, 4 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L211
|
||||
b .L997
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
bge $r0, I, .L997
|
||||
move YY, Y
|
||||
.align 3
|
||||
|
||||
.L222:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 0
|
||||
vinsgr2vr.w x2, t2, 0
|
||||
vinsgr2vr.w x1, t3, 1
|
||||
vinsgr2vr.w x2, t4, 1
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 0
|
||||
vinsgr2vr.w x4, t2, 0
|
||||
vinsgr2vr.w x3, t3, 1
|
||||
vinsgr2vr.w x4, t4, 1
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 2
|
||||
vinsgr2vr.w x2, t2, 2
|
||||
vinsgr2vr.w x1, t3, 3
|
||||
vinsgr2vr.w x2, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
vinsgr2vr.w x3, t1, 2
|
||||
vinsgr2vr.w x4, t2, 2
|
||||
vinsgr2vr.w x3, t3, 3
|
||||
vinsgr2vr.w x4, t4, 3
|
||||
#endif
|
||||
add.d Y, Y, INCY
|
||||
#if !defined(CONJ)
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmsub.s VX1, VXAR, x1, VX0
|
||||
vfmadd.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfadd.s x4, x4, VX3
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmadd.s VX1, VXAR, x1, VX0
|
||||
vfmsub.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfsub.s x4, x4, VX3
|
||||
#endif
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
vstelm.d x3, YY, 0 * SIZE, 0
|
||||
vstelm.d x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d x3, YY, 0 * SIZE, 1
|
||||
vstelm.d x4, YY, 1 * SIZE, 1
|
||||
add.d YY, YY, INCY
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
add.d Y, Y, INCY
|
||||
#if !defined(CONJ)
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
vstelm.d x3, YY, 0 * SIZE, 0
|
||||
vstelm.d x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d x3, YY, 0 * SIZE, 1
|
||||
vstelm.d x4, YY, 1 * SIZE, 1
|
||||
#else
|
||||
addi.d I, I, -1
|
||||
vstelm.w x3, YY, 0 * SIZE, 0
|
||||
vstelm.w x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 1
|
||||
vstelm.w x4, YY, 1 * SIZE, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 2
|
||||
vstelm.w x4, YY, 1 * SIZE, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 3
|
||||
vstelm.w x4, YY, 1 * SIZE, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
.L997:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L998:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, Y, 0 * SIZE
|
||||
LD a4, Y, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
#if !defined(CONJ)
|
||||
MUL s1, ALPHAI, a2
|
||||
MUL s2, ALPHAI, a1
|
||||
MSUB s3, ALPHAR, a1, s1
|
||||
MADD s4, a2, ALPHAR, s2
|
||||
ADD s3, s3, a3
|
||||
ADD s4, s4, a4
|
||||
#else
|
||||
MUL s1, ALPHAI, a2
|
||||
MUL s2, ALPHAI, a1
|
||||
MADD s3, ALPHAR, a1, s1
|
||||
MSUB s4, a2, ALPHAR, s2
|
||||
ADD s3, s3, a3
|
||||
SUB s4, a4, s4
|
||||
#endif
|
||||
ST s3, Y, 0 * SIZE
|
||||
ST s4, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r12
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,386 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
// Register aliases for the copy routine that follows.
// Each element it copies consists of two consecutive SIZE-wide words
// (presumably the real and imaginary parts of a complex number - confirm).
//
// Inputs (read by the routine before it writes them): element count, source
// pointer, source stride, destination pointer, destination stride.
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
// I is the loop counter; TEMP holds the scaled unit stride used to detect
// contiguous vectors.
#define I $r17
|
||||
#define TEMP $r18
|
||||
// Integer scratch registers: carry strided source words into vector lanes.
#define t1 $r14
|
||||
#define t2 $r15
|
||||
#define t3 $r16
|
||||
#define t4 $r19
|
||||
// Scalar floating-point scratch registers for the element-by-element loops.
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
// $xr vector registers used by the unrolled loops.
#define VX0 $xr12
|
||||
#define VX1 $xr13
|
||||
#define VX2 $xr14
|
||||
#define VX3 $xr15
|
||||
|
||||
// Entry point: scale the strides, compute the unrolled trip count and branch
// to one of four loops depending on which strides are unit.
PROLOGUE
|
||||
// Nothing to do when N <= 0.
bge $r0, N, .L999
|
||||
// TEMP = 1 << ZBASE_SHIFT, i.e. the value a stride of 1 has after the
// scaling applied to INCX/INCY just below.
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
// Scale both strides by ZBASE_SHIFT (presumably elements -> bytes; the macro
// comes from common.h - confirm).
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
slli.d INCY, INCY, ZBASE_SHIFT
|
||||
// I = N >> 3: number of passes through the 8-element unrolled loops.
srai.d I, N, 3
|
||||
// Four-way dispatch on (INCX is unit, INCY is unit).
bne INCX, TEMP, .L20
|
||||
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
|
||||
b .L11 // INCX==1 and INCY==1
|
||||
.L20:
|
||||
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
|
||||
b .L21 // INCX!=1 and INCY==1
|
||||
|
||||
// Path taken when both strides are unit: plain vector load/store copy.
.L11:
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L112
|
||||
.align 3
|
||||
|
||||
// Main loop: each pass moves 16 * SIZE bytes (8 two-word elements) from X to Y.
.L111:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
// DOUBLE build: four registers, stepping 4 * SIZE per register.
xvld VX1, X, 4 * SIZE
|
||||
xvld VX2, X, 8 * SIZE
|
||||
xvld VX3, X, 12 * SIZE
|
||||
xvst VX0, Y, 0 * SIZE
|
||||
xvst VX1, Y, 4 * SIZE
|
||||
xvst VX2, Y, 8 * SIZE
|
||||
xvst VX3, Y, 12 * SIZE
|
||||
#else
|
||||
// Non-DOUBLE build: two registers, stepping 8 * SIZE per register.
xvld VX1, X, 8 * SIZE
|
||||
xvst VX0, Y, 0 * SIZE
|
||||
xvst VX1, Y, 8 * SIZE
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 16 * SIZE
|
||||
addi.d Y, Y, 16 * SIZE
|
||||
// Loop while I > 0.
blt $r0, I, .L111
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L112:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// Copy one element (two words) per pass with scalar loads and stores.
.L113:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 2 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
addi.d Y, Y, 2 * SIZE
|
||||
blt $r0, I, .L113
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
// Path taken when the source stride is unit and the destination stride is not:
// vector loads from X, per-lane stores scattered to Y.
.L12:
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L122
|
||||
.align 3
|
||||
|
||||
// Main loop: load 16 * SIZE bytes of X, then write the lanes out two at a time
// (one element), advancing Y by INCY after each element.
.L121:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
// DOUBLE build: four registers, each holding two elements (lanes 0-1, 2-3).
xvld VX1, X, 4 * SIZE
|
||||
xvld VX2, X, 8 * SIZE
|
||||
xvld VX3, X, 12 * SIZE
|
||||
xvstelm.d VX0, Y, 0 * SIZE, 0
|
||||
xvstelm.d VX0, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.d VX0, Y, 0 * SIZE, 2
|
||||
xvstelm.d VX0, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.d VX1, Y, 0 * SIZE, 0
|
||||
xvstelm.d VX1, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.d VX1, Y, 0 * SIZE, 2
|
||||
xvstelm.d VX1, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.d VX2, Y, 0 * SIZE, 0
|
||||
xvstelm.d VX2, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.d VX2, Y, 0 * SIZE, 2
|
||||
xvstelm.d VX2, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.d VX3, Y, 0 * SIZE, 0
|
||||
xvstelm.d VX3, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.d VX3, Y, 0 * SIZE, 2
|
||||
xvstelm.d VX3, Y, 1 * SIZE, 3
|
||||
#else
|
||||
// Non-DOUBLE build: two registers, each holding four elements (lanes 0-7).
xvld VX1, X, 8 * SIZE
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 0
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 2
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 4
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 5
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 6
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 7
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.w VX1, Y, 0 * SIZE, 0
|
||||
xvstelm.w VX1, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.w VX1, Y, 0 * SIZE, 2
|
||||
xvstelm.w VX1, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.w VX1, Y, 0 * SIZE, 4
|
||||
xvstelm.w VX1, Y, 1 * SIZE, 5
|
||||
add.d Y, Y, INCY
|
||||
xvstelm.w VX1, Y, 0 * SIZE, 6
|
||||
xvstelm.w VX1, Y, 1 * SIZE, 7
|
||||
#endif
|
||||
// Shared by both builds: step Y past the eighth element, then advance X.
add.d Y, Y, INCY
|
||||
addi.d X, X, 16 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L121
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L122:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// One element per pass: contiguous read from X, strided write to Y.
.L123:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 2 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L123
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
// Path taken when the source stride is not unit and the destination stride is:
// gather X through integer registers into vector lanes, then store whole
// vectors to Y.
.L21:
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L212
|
||||
.align 3
|
||||
|
||||
// Main loop: 8 elements per pass.
.L211:
|
||||
#ifdef DOUBLE
|
||||
// DOUBLE build: two elements (four 64-bit words) are gathered per register.
// Elements 0-1 -> VX0.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX0, t1, 0
|
||||
xvinsgr2vr.d VX0, t2, 1
|
||||
xvinsgr2vr.d VX0, t3, 2
|
||||
xvinsgr2vr.d VX0, t4, 3
|
||||
// Elements 2-3 -> VX1.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX1, t1, 0
|
||||
xvinsgr2vr.d VX1, t2, 1
|
||||
xvinsgr2vr.d VX1, t3, 2
|
||||
xvinsgr2vr.d VX1, t4, 3
|
||||
// Elements 4-5 -> VX2.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX2, t1, 0
|
||||
xvinsgr2vr.d VX2, t2, 1
|
||||
xvinsgr2vr.d VX2, t3, 2
|
||||
xvinsgr2vr.d VX2, t4, 3
|
||||
// Elements 6-7 -> VX3.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX3, t1, 0
|
||||
xvinsgr2vr.d VX3, t2, 1
|
||||
xvinsgr2vr.d VX3, t3, 2
|
||||
xvinsgr2vr.d VX3, t4, 3
|
||||
// Write the four gathered registers to contiguous Y.
xvst VX0, Y, 0 * SIZE
|
||||
xvst VX1, Y, 4 * SIZE
|
||||
xvst VX2, Y, 8 * SIZE
|
||||
xvst VX3, Y, 12 * SIZE
|
||||
#else
|
||||
// Non-DOUBLE build: four elements (eight 32-bit words) are gathered per register.
// Elements 0-1 -> VX0 lanes 0-3.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
// Elements 2-3 -> VX0 lanes 4-7.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
// Elements 4-5 -> VX1 lanes 0-3.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX1, t1, 0
|
||||
xvinsgr2vr.w VX1, t2, 1
|
||||
xvinsgr2vr.w VX1, t3, 2
|
||||
xvinsgr2vr.w VX1, t4, 3
|
||||
// Elements 6-7 -> VX1 lanes 4-7.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX1, t1, 4
|
||||
xvinsgr2vr.w VX1, t2, 5
|
||||
xvinsgr2vr.w VX1, t3, 6
|
||||
xvinsgr2vr.w VX1, t4, 7
|
||||
// Write the two gathered registers to contiguous Y.
xvst VX0, Y, 0 * SIZE
|
||||
xvst VX1, Y, 8 * SIZE
|
||||
#endif
|
||||
// X was already advanced element by element above; only Y needs a bulk step.
addi.d I, I, -1
|
||||
addi.d Y, Y, 16 * SIZE
|
||||
blt $r0, I, .L211
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L212:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// One element per pass: strided read from X, contiguous write to Y.
.L213:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d Y, Y, 2 * SIZE
|
||||
blt $r0, I, .L213
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
// Path taken when neither stride is unit: scalar copy only, no vector
// instructions are used on this path.
.L22:
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L223
|
||||
.align 3
|
||||
|
||||
// Main loop, unrolled to 8 elements per pass as four groups of two elements.
// Each group loads two elements from X (stepping by INCX) and stores them to
// Y (stepping by INCY).
.L222:
|
||||
// Elements 0-1.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
// Elements 2-3.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
// Elements 4-5.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
// Elements 6-7.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L223:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// One element per pass; falls through to the exit label when done.
.L224:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L224
|
||||
.align 3
|
||||
|
||||
// Common exit for every path of the copy routine.
.L999:
|
||||
// Return 0 in $r4. The previous code copied $r12 here, but $r12 is never
// written anywhere in this routine, so the integer result was whatever the
// caller happened to leave in that register. $r0 always reads as zero.
move $r4, $r0
|
||||
// Return to the address held in $r1.
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,411 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
// Register aliases for the copy routine that follows.
// Each element it copies consists of two consecutive SIZE-wide words
// (presumably the real and imaginary parts of a complex number - confirm).
//
// Inputs (read by the routine before it writes them): element count, source
// pointer, source stride, destination pointer, destination stride.
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
// I is the loop counter; TEMP holds the scaled unit stride used to detect
// contiguous vectors.
#define I $r17
|
||||
#define TEMP $r18
|
||||
// Integer scratch registers: carry strided source words into vector lanes.
#define t1 $r14
|
||||
#define t2 $r15
|
||||
#define t3 $r16
|
||||
#define t4 $r19
|
||||
// Scalar floating-point scratch registers for the element-by-element loops.
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
// $vr vector registers used by the unrolled loops.
#define VX0 $vr12
|
||||
#define VX1 $vr13
|
||||
#define VX2 $vr14
|
||||
#define VX3 $vr15
|
||||
|
||||
// Entry point: scale the strides, compute the unrolled trip count and branch
// to one of four loops depending on which strides are unit.
PROLOGUE
|
||||
// Nothing to do when N <= 0.
bge $r0, N, .L999
|
||||
// TEMP = 1 << ZBASE_SHIFT, i.e. the value a stride of 1 has after the
// scaling applied to INCX/INCY just below.
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
// Scale both strides by ZBASE_SHIFT (presumably elements -> bytes; the macro
// comes from common.h - confirm).
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
slli.d INCY, INCY, ZBASE_SHIFT
|
||||
// I = N >> 3: number of passes through the 8-element unrolled loops.
srai.d I, N, 3
|
||||
// Four-way dispatch on (INCX is unit, INCY is unit).
bne INCX, TEMP, .L20
|
||||
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
|
||||
b .L11 // INCX==1 and INCY==1
|
||||
.L20:
|
||||
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
|
||||
b .L21 // INCX!=1 and INCY==1
|
||||
|
||||
// Contiguous source and destination: plain vector load/store copy.
.L11:// INCX==1 and INCY==1
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L112
|
||||
.align 3
|
||||
|
||||
// Main loop: each pass moves 16 * SIZE bytes (8 two-word elements) from X to Y.
.L111:
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
// DOUBLE build: registers step by 2 * SIZE, so the four registers are used
// twice per pass (offsets 0-6, then 8-14).
vld VX1, X, 2 * SIZE
|
||||
vld VX2, X, 4 * SIZE
|
||||
vld VX3, X, 6 * SIZE
|
||||
vst VX0, Y, 0 * SIZE
|
||||
vst VX1, Y, 2 * SIZE
|
||||
vst VX2, Y, 4 * SIZE
|
||||
vst VX3, Y, 6 * SIZE
|
||||
vld VX0, X, 8 * SIZE
|
||||
vld VX1, X, 10 * SIZE
|
||||
vld VX2, X, 12 * SIZE
|
||||
vld VX3, X, 14 * SIZE
|
||||
addi.d I, I, -1
|
||||
vst VX0, Y, 8 * SIZE
|
||||
vst VX1, Y, 10 * SIZE
|
||||
vst VX2, Y, 12 * SIZE
|
||||
vst VX3, Y, 14 * SIZE
|
||||
#else
|
||||
// Non-DOUBLE build: registers step by 4 * SIZE, one round of four registers.
vld VX1, X, 4 * SIZE
|
||||
vld VX2, X, 8 * SIZE
|
||||
vld VX3, X, 12 * SIZE
|
||||
addi.d I, I, -1
|
||||
vst VX0, Y, 0 * SIZE
|
||||
vst VX1, Y, 4 * SIZE
|
||||
vst VX2, Y, 8 * SIZE
|
||||
vst VX3, Y, 12 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 16 * SIZE
|
||||
addi.d Y, Y, 16 * SIZE
|
||||
// Loop while I > 0.
blt $r0, I, .L111
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L112:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// Copy one element (two words) per pass with scalar loads and stores.
.L113:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 2 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
addi.d Y, Y, 2 * SIZE
|
||||
blt $r0, I, .L113
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
// Contiguous source, strided destination: vector loads from X, per-lane
// stores scattered to Y.
.L12: // INCX==1 and INCY!=1
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L122
|
||||
.align 3
|
||||
|
||||
// Main loop: 8 elements per pass; Y advances by INCY after each element.
.L121:
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
// DOUBLE build: each register holds one element (lanes 0-1). Elements 0-3:
vld VX1, X, 2 * SIZE
|
||||
vld VX2, X, 4 * SIZE
|
||||
vld VX3, X, 6 * SIZE
|
||||
vstelm.d VX0, Y, 0 * SIZE, 0
|
||||
vstelm.d VX0, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.d VX1, Y, 0 * SIZE, 0
|
||||
vstelm.d VX1, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.d VX2, Y, 0 * SIZE, 0
|
||||
vstelm.d VX2, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.d VX3, Y, 0 * SIZE, 0
|
||||
vstelm.d VX3, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
// Elements 4-7: reload the same four registers from the second half.
vld VX0, X, 8 * SIZE
|
||||
vld VX1, X, 10 * SIZE
|
||||
vld VX2, X, 12 * SIZE
|
||||
vld VX3, X, 14 * SIZE
|
||||
vstelm.d VX0, Y, 0 * SIZE, 0
|
||||
vstelm.d VX0, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.d VX1, Y, 0 * SIZE, 0
|
||||
vstelm.d VX1, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.d VX2, Y, 0 * SIZE, 0
|
||||
vstelm.d VX2, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.d VX3, Y, 0 * SIZE, 0
|
||||
vstelm.d VX3, Y, 1 * SIZE, 1
|
||||
#else
|
||||
// Non-DOUBLE build: each register holds two elements (lanes 0-1 and 2-3).
vld VX1, X, 4 * SIZE
|
||||
vld VX2, X, 8 * SIZE
|
||||
vld VX3, X, 12 * SIZE
|
||||
vstelm.w VX0, Y, 0 * SIZE, 0
|
||||
vstelm.w VX0, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.w VX0, Y, 0 * SIZE, 2
|
||||
vstelm.w VX0, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
vstelm.w VX1, Y, 0 * SIZE, 0
|
||||
vstelm.w VX1, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.w VX1, Y, 0 * SIZE, 2
|
||||
vstelm.w VX1, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
vstelm.w VX2, Y, 0 * SIZE, 0
|
||||
vstelm.w VX2, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.w VX2, Y, 0 * SIZE, 2
|
||||
vstelm.w VX2, Y, 1 * SIZE, 3
|
||||
add.d Y, Y, INCY
|
||||
vstelm.w VX3, Y, 0 * SIZE, 0
|
||||
vstelm.w VX3, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
vstelm.w VX3, Y, 0 * SIZE, 2
|
||||
vstelm.w VX3, Y, 1 * SIZE, 3
|
||||
#endif
|
||||
// Shared by both builds: step Y past the eighth element, then advance X.
add.d Y, Y, INCY
|
||||
addi.d X, X, 16 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L121
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L122:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// One element per pass: contiguous read from X, strided write to Y.
.L123:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 2 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L123
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
// Path taken when the source stride is not unit and the destination stride is:
// gather X through integer registers into vector lanes, then store whole
// vectors to Y.
.L21:
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L212
|
||||
.align 3
|
||||
|
||||
// Main loop: 8 elements per pass.
.L211:
|
||||
#ifdef DOUBLE
|
||||
// DOUBLE build: one element per register; each pair of elements is gathered
// into VX0/VX1 and stored immediately, four times per pass.
// Elements 0-1 -> Y offsets 0 and 2.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vinsgr2vr.d VX1, t3, 0
|
||||
vinsgr2vr.d VX1, t4, 1
|
||||
vst VX0, Y, 0 * SIZE
|
||||
vst VX1, Y, 2 * SIZE
|
||||
// Elements 2-3 -> Y offsets 4 and 6.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vinsgr2vr.d VX1, t3, 0
|
||||
vinsgr2vr.d VX1, t4, 1
|
||||
vst VX0, Y, 4 * SIZE
|
||||
vst VX1, Y, 6 * SIZE
|
||||
// Elements 4-5 -> Y offsets 8 and 10.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vinsgr2vr.d VX1, t3, 0
|
||||
vinsgr2vr.d VX1, t4, 1
|
||||
vst VX0, Y, 8 * SIZE
|
||||
vst VX1, Y, 10 * SIZE
|
||||
// Elements 6-7 -> Y offsets 12 and 14.
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vinsgr2vr.d VX1, t3, 0
|
||||
vinsgr2vr.d VX1, t4, 1
|
||||
vst VX0, Y, 12 * SIZE
|
||||
vst VX1, Y, 14 * SIZE
|
||||
#else
|
||||
// Non-DOUBLE build: two elements (four 32-bit words) are gathered per
// register; all four registers are filled first and stored at the end.
// Elements 0-1 -> VX0.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
// Elements 2-3 -> VX1.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
// Elements 4-5 -> VX2.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX2, t1, 0
|
||||
vinsgr2vr.w VX2, t2, 1
|
||||
vinsgr2vr.w VX2, t3, 2
|
||||
vinsgr2vr.w VX2, t4, 3
|
||||
// Elements 6-7 -> VX3.
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX3, t1, 0
|
||||
vinsgr2vr.w VX3, t2, 1
|
||||
vinsgr2vr.w VX3, t3, 2
|
||||
vinsgr2vr.w VX3, t4, 3
|
||||
// Write the four gathered registers to contiguous Y.
vst VX0, Y, 0 * SIZE
|
||||
vst VX1, Y, 4 * SIZE
|
||||
vst VX2, Y, 8 * SIZE
|
||||
vst VX3, Y, 12 * SIZE
|
||||
#endif
|
||||
// X was already advanced element by element above; only Y needs a bulk step.
addi.d Y, Y, 16 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L211
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L212:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// One element per pass: strided read from X, contiguous write to Y.
.L213:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d Y, Y, 2 * SIZE
|
||||
blt $r0, I, .L213
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
// Path taken when neither stride is unit: scalar copy only, no vector
// instructions are used on this path.
.L22:
|
||||
// Fewer than 8 elements: go straight to the remainder loop.
bge $r0, I, .L223
|
||||
.align 3
|
||||
|
||||
// Main loop, unrolled to 8 elements per pass as four groups of two elements.
// Each group loads two elements from X (stepping by INCX) and stores them to
// Y (stepping by INCY).
.L222:
|
||||
// Elements 0-1.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
// Elements 2-3.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
// Elements 4-5.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
// Elements 6-7.
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
// Remainder: the N & 7 elements left over after the unrolled loop.
.L223:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
// One element per pass; falls through to the exit label when done.
.L224:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L224
|
||||
.align 3
|
||||
|
||||
// Common exit for every path of the copy routine.
.L999:
|
||||
// Return 0 in $r4. The previous code copied $r12 here, but $r12 is never
// written anywhere in this routine, so the integer result was whatever the
// caller happened to leave in that register. $r0 always reads as zero.
move $r4, $r0
|
||||
// Return to the address held in $r1.
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,565 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
// Register aliases for the routine that follows, which multiplies
// de-interleaved words of X and Y and accumulates the products.
//
// Inputs (read by the routine before it writes them): element count, first
// vector pointer and stride, second vector pointer and stride.
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
// I is the loop counter; TEMP holds 2 * SIZE, the scaled unit stride that
// INCX/INCY are compared against.
#define I $r19
|
||||
#define TEMP $r10
|
||||
// Integer scratch registers (presumably used by the strided paths, which are
// outside the visible part of the routine - confirm).
#define t1 $r11
|
||||
#define t2 $r12
|
||||
#define t3 $r13
|
||||
#define t4 $r14
|
||||
// Scalar floating-point scratch registers (presumably for the scalar
// remainder and final reduction - confirm against the rest of the routine).
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define s1 $f16
|
||||
#define s2 $f17
|
||||
#define s3 $f18
|
||||
#define s4 $f19
|
||||
// Vector accumulators, cleared on entry and updated by fused multiply-add:
// res1 += x1*x3, res2 += x2*x3, res3 += x1*x4, res4 += x2*x4.
// NOTE(review): res1..res4 ($xr16..$xr19) share register numbers with
// s1..s4 ($f16..$f19) - confirm the intended aliasing.
#define res1 $xr16
|
||||
#define res2 $xr17
|
||||
#define res3 $xr18
|
||||
#define res4 $xr19
|
||||
// Raw vector loads: VX0/VX1 come from X, VX2/VX3 from Y.
#define VX0 $xr12
|
||||
#define VX1 $xr13
|
||||
#define VX2 $xr14
|
||||
#define VX3 $xr15
|
||||
// De-interleaved operands: x1/x2 are the even/odd words of the X data and
// x3/x4 the even/odd words of the Y data (presumably real/imaginary parts).
#define x1 $xr20
|
||||
#define x2 $xr21
|
||||
#define x3 $xr22
|
||||
#define x4 $xr23
|
||||
|
||||
/* Entry of the LASX complex dot-product kernel.
 *
 * Register aliases (see the #define list above): N = element count,
 * X/Y = vector base pointers, INCX/INCY = strides counted in complex
 * elements on entry.  The code below rescales the strides with
 * ZBASE_SHIFT and compares them with 2 * SIZE to detect unit stride
 * (presumably SIZE is the byte size of one real and ZBASE_SHIFT the log2
 * of one complex element, both from common.h -- confirm there).
 *
 * Four accumulators are kept:
 *   res1 += Re(x)*Re(y)   res2 += Im(x)*Re(y)
 *   res3 += Re(x)*Im(y)   res4 += Im(x)*Im(y)
 * They are combined into the real/imaginary result at .L999.
 */
    PROLOGUE
    // Clear the accumulators first: the N <= 0 early exit below goes
    // straight to .L999, which reads s1..s4 ($f16..$f19, the same
    // register numbers as res1..res4).
    xvxor.v res4, res4, res4
    xvxor.v res3, res3, res3
    xvxor.v res2, res2, res2
    xvxor.v res1, res1, res1
    bge $r0, N, .L999               // nothing to do for N <= 0

    // I = number of full vector iterations:
    // 4 complex elements per pass in double, 8 in single precision.
#ifndef DOUBLE
    srai.d I, N, 3
#else
    srai.d I, N, 2
#endif
    slli.d INCY, INCY, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    li.d TEMP, 2 * SIZE             // scaled stride of a contiguous vector

    // Pick one of four loops depending on which operands are contiguous.
    bne INCX, TEMP, .L20
    beq INCY, TEMP, .L11            // INCX==1 and INCY==1
    b .L12                          // INCX==1 and INCY!=1
.L20:
    beq INCY, TEMP, .L21            // INCX!=1 and INCY==1
    b .L22                          // INCX!=1 and INCY!=1
|
||||
|
||||
.L11:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L111:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
xvld VX3, Y, 4 * SIZE
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
xvpickev.d x3, VX3, VX2
|
||||
xvpickod.d x4, VX3, VX2
|
||||
xvfmadd.d res1, x1, x3, res1
|
||||
xvfmadd.d res2, x2, x3, res2
|
||||
xvfmadd.d res3, x1, x4, res3
|
||||
xvfmadd.d res4, x2, x4, res4
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#else
|
||||
xvld VX1, X, 8 * SIZE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
xvld VX3, Y, 8 * SIZE
|
||||
xvpickev.w x1, VX1, VX0
|
||||
xvpickod.w x2, VX1, VX0
|
||||
xvpickev.w x3, VX3, VX2
|
||||
xvpickod.w x4, VX3, VX2
|
||||
xvfmadd.s res1, x1, x3, res1
|
||||
xvfmadd.s res2, x2, x3, res2
|
||||
xvfmadd.s res3, x1, x4, res3
|
||||
xvfmadd.s res4, x2, x4, res4
|
||||
addi.d X, X, 16 * SIZE
|
||||
addi.d Y, Y, 16 * SIZE
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L111
|
||||
b .L996
|
||||
.align 3
|
||||
|
||||
.L12:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L121:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.d x3, t1, 0
|
||||
xvinsgr2vr.d x4, t2, 0
|
||||
xvinsgr2vr.d x3, t3, 2
|
||||
xvinsgr2vr.d x4, t4, 2
|
||||
xvld VX1, X, 4 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.d x3, t1, 1
|
||||
xvinsgr2vr.d x4, t2, 1
|
||||
xvinsgr2vr.d x3, t3, 3
|
||||
xvinsgr2vr.d x4, t4, 3
|
||||
addi.d X, X, 8 * SIZE
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
xvfmadd.d res1, x1, x3, res1
|
||||
xvfmadd.d res2, x2, x3, res2
|
||||
xvfmadd.d res3, x1, x4, res3
|
||||
xvfmadd.d res4, x2, x4, res4
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 0
|
||||
xvinsgr2vr.w x4, t2, 0
|
||||
xvinsgr2vr.w x3, t3, 1
|
||||
xvinsgr2vr.w x4, t4, 1
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 4
|
||||
xvinsgr2vr.w x4, t2, 4
|
||||
xvinsgr2vr.w x3, t3, 5
|
||||
xvinsgr2vr.w x4, t4, 5
|
||||
xvld VX1, X, 8 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 2
|
||||
xvinsgr2vr.w x4, t2, 2
|
||||
xvinsgr2vr.w x3, t3, 3
|
||||
xvinsgr2vr.w x4, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 6
|
||||
xvinsgr2vr.w x4, t2, 6
|
||||
xvinsgr2vr.w x3, t3, 7
|
||||
xvinsgr2vr.w x4, t4, 7
|
||||
addi.d X, X, 16 * SIZE
|
||||
xvpickev.w x1, VX1, VX0
|
||||
xvpickod.w x2, VX1, VX0
|
||||
xvfmadd.s res1, x1, x3, res1
|
||||
xvfmadd.s res2, x2, x3, res2
|
||||
xvfmadd.s res3, x1, x4, res3
|
||||
xvfmadd.s res4, x2, x4, res4
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L121
|
||||
b .L996
|
||||
.align 3
|
||||
|
||||
/* .L21 / .L211: vector loop for strided X (INCX != 1) and contiguous Y.
 *
 * Each pass consumes 4 complex elements (DOUBLE) or 8 complex elements
 * (single precision):
 *   - Y is read with two 256-bit loads (VX2, VX3) and split into its
 *     even (real) and odd (imaginary) elements with xvpickev/xvpickod.
 *   - X is gathered element by element through t1..t4 into x1 (reals,
 *     offset 0) and x2 (imaginaries, offset 1).  The insertion lane
 *     numbers are deliberately non-sequential so that x1/x2 end up in
 *     the same element order that xvpickev/xvpickod produce for Y
 *     (those instructions work on each 128-bit half separately).
 *   - res1 += x1*x3, res2 += x2*x3, res3 += x1*x4, res4 += x2*x4.
 *
 * On exit the loop falls into the horizontal reduction at .L996; if
 * there is no full vector iteration it skips directly to the scalar
 * tail at .L997.
 */
.L21:
    bge $r0, I, .L997
    .align 3

.L211:
    xvld VX2, Y, 0 * SIZE
#ifdef DOUBLE
    // X elements 0 and 1 -> lanes 0 and 2
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 2
    xvinsgr2vr.d x2, t4, 2
    xvld VX3, Y, 4 * SIZE
    // X elements 2 and 3 -> lanes 1 and 3
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 1
    xvinsgr2vr.d x2, t2, 1
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    addi.d Y, Y, 8 * SIZE           // 4 complex doubles consumed from Y
    xvpickev.d x3, VX3, VX2         // real parts of Y
    xvpickod.d x4, VX3, VX2         // imaginary parts of Y
    xvfmadd.d res1, x1, x3, res1
    xvfmadd.d res2, x2, x3, res2
    xvfmadd.d res3, x1, x4, res3
    xvfmadd.d res4, x2, x4, res4
#else
    // X elements 0 and 1 -> lanes 0 and 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    // X elements 2 and 3 -> lanes 4 and 5
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    xvld VX3, Y, 8 * SIZE
    // X elements 4 and 5 -> lanes 2 and 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    // X elements 6 and 7 -> lanes 6 and 7
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    // VX2 and VX3 together cover 16 floats (8 complex elements) of Y, so
    // Y must advance by 16 * SIZE.  The previous 8 * SIZE re-read half of
    // the data on the next pass and left Y misplaced for the scalar tail.
    addi.d Y, Y, 16 * SIZE
    xvpickev.w x3, VX3, VX2         // real parts of Y
    xvpickod.w x4, VX3, VX2         // imaginary parts of Y
    xvfmadd.s res1, x1, x3, res1
    xvfmadd.s res2, x2, x3, res2
    xvfmadd.s res3, x1, x4, res3
    xvfmadd.s res4, x2, x4, res4
#endif
    addi.d I, I, -1
    blt $r0, I, .L211
    b .L996
    .align 3
|
||||
|
||||
.L22:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L222:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d x1, t1, 0
|
||||
xvinsgr2vr.d x2, t2, 0
|
||||
xvinsgr2vr.d x1, t3, 1
|
||||
xvinsgr2vr.d x2, t4, 1
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.d x3, t1, 0
|
||||
xvinsgr2vr.d x4, t2, 0
|
||||
xvinsgr2vr.d x3, t3, 1
|
||||
xvinsgr2vr.d x4, t4, 1
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d x1, t1, 2
|
||||
xvinsgr2vr.d x2, t2, 2
|
||||
xvinsgr2vr.d x1, t3, 3
|
||||
xvinsgr2vr.d x2, t4, 3
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.d x3, t1, 2
|
||||
xvinsgr2vr.d x4, t2, 2
|
||||
xvinsgr2vr.d x3, t3, 3
|
||||
xvinsgr2vr.d x4, t4, 3
|
||||
xvfmadd.d res1, x1, x3, res1
|
||||
xvfmadd.d res2, x2, x3, res2
|
||||
xvfmadd.d res3, x1, x4, res3
|
||||
xvfmadd.d res4, x2, x4, res4
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 0
|
||||
xvinsgr2vr.w x2, t2, 0
|
||||
xvinsgr2vr.w x1, t3, 1
|
||||
xvinsgr2vr.w x2, t4, 1
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 0
|
||||
xvinsgr2vr.w x4, t2, 0
|
||||
xvinsgr2vr.w x3, t3, 1
|
||||
xvinsgr2vr.w x4, t4, 1
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 2
|
||||
xvinsgr2vr.w x2, t2, 2
|
||||
xvinsgr2vr.w x1, t3, 3
|
||||
xvinsgr2vr.w x2, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 2
|
||||
xvinsgr2vr.w x4, t2, 2
|
||||
xvinsgr2vr.w x3, t3, 3
|
||||
xvinsgr2vr.w x4, t4, 3
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 4
|
||||
xvinsgr2vr.w x2, t2, 4
|
||||
xvinsgr2vr.w x1, t3, 5
|
||||
xvinsgr2vr.w x2, t4, 5
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 4
|
||||
xvinsgr2vr.w x4, t2, 4
|
||||
xvinsgr2vr.w x3, t3, 5
|
||||
xvinsgr2vr.w x4, t4, 5
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 6
|
||||
xvinsgr2vr.w x2, t2, 6
|
||||
xvinsgr2vr.w x1, t3, 7
|
||||
xvinsgr2vr.w x2, t4, 7
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
xvinsgr2vr.w x3, t1, 6
|
||||
xvinsgr2vr.w x4, t2, 6
|
||||
xvinsgr2vr.w x3, t3, 7
|
||||
xvinsgr2vr.w x4, t4, 7
|
||||
xvfmadd.s res1, x1, x3, res1
|
||||
xvfmadd.s res2, x2, x3, res2
|
||||
xvfmadd.s res3, x1, x4, res3
|
||||
xvfmadd.s res4, x2, x4, res4
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
.L996:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VX1, res1, 1
|
||||
xvpickve.d VX2, res1, 2
|
||||
xvpickve.d VX3, res1, 3
|
||||
xvfadd.d res1, VX1, res1
|
||||
xvfadd.d res1, VX2, res1
|
||||
xvfadd.d res1, VX3, res1
|
||||
xvpickve.d VX1, res2, 1
|
||||
xvpickve.d VX2, res2, 2
|
||||
xvpickve.d VX3, res2, 3
|
||||
xvfadd.d res2, VX1, res2
|
||||
xvfadd.d res2, VX2, res2
|
||||
xvfadd.d res2, VX3, res2
|
||||
xvpickve.d VX1, res3, 1
|
||||
xvpickve.d VX2, res3, 2
|
||||
xvpickve.d VX3, res3, 3
|
||||
xvfadd.d res3, VX1, res3
|
||||
xvfadd.d res3, VX2, res3
|
||||
xvfadd.d res3, VX3, res3
|
||||
xvpickve.d VX1, res4, 1
|
||||
xvpickve.d VX2, res4, 2
|
||||
xvpickve.d VX3, res4, 3
|
||||
xvfadd.d res4, VX1, res4
|
||||
xvfadd.d res4, VX2, res4
|
||||
xvfadd.d res4, VX3, res4
|
||||
#else
|
||||
xvpickve.w VX0, res1, 1
|
||||
xvpickve.w VX1, res1, 2
|
||||
xvpickve.w VX2, res1, 3
|
||||
xvpickve.w VX3, res1, 4
|
||||
xvpickve.w x1, res1, 5
|
||||
xvpickve.w x2, res1, 6
|
||||
xvpickve.w x3, res1, 7
|
||||
xvfadd.s res1, VX0, res1
|
||||
xvfadd.s res1, VX1, res1
|
||||
xvfadd.s res1, VX2, res1
|
||||
xvfadd.s res1, VX3, res1
|
||||
xvfadd.s res1, x1, res1
|
||||
xvfadd.s res1, x2, res1
|
||||
xvfadd.s res1, x3, res1
|
||||
xvpickve.w VX0, res2, 1
|
||||
xvpickve.w VX1, res2, 2
|
||||
xvpickve.w VX2, res2, 3
|
||||
xvpickve.w VX3, res2, 4
|
||||
xvpickve.w x1, res2, 5
|
||||
xvpickve.w x2, res2, 6
|
||||
xvpickve.w x3, res2, 7
|
||||
xvfadd.s res2, VX0, res2
|
||||
xvfadd.s res2, VX1, res2
|
||||
xvfadd.s res2, VX2, res2
|
||||
xvfadd.s res2, VX3, res2
|
||||
xvfadd.s res2, x1, res2
|
||||
xvfadd.s res2, x2, res2
|
||||
xvfadd.s res2, x3, res2
|
||||
xvpickve.w VX0, res3, 1
|
||||
xvpickve.w VX1, res3, 2
|
||||
xvpickve.w VX2, res3, 3
|
||||
xvpickve.w VX3, res3, 4
|
||||
xvpickve.w x1, res3, 5
|
||||
xvpickve.w x2, res3, 6
|
||||
xvpickve.w x3, res3, 7
|
||||
xvfadd.s res3, VX0, res3
|
||||
xvfadd.s res3, VX1, res3
|
||||
xvfadd.s res3, VX2, res3
|
||||
xvfadd.s res3, VX3, res3
|
||||
xvfadd.s res3, x1, res3
|
||||
xvfadd.s res3, x2, res3
|
||||
xvfadd.s res3, x3, res3
|
||||
xvpickve.w VX0, res4, 1
|
||||
xvpickve.w VX1, res4, 2
|
||||
xvpickve.w VX2, res4, 3
|
||||
xvpickve.w VX3, res4, 4
|
||||
xvpickve.w x1, res4, 5
|
||||
xvpickve.w x2, res4, 6
|
||||
xvpickve.w x3, res4, 7
|
||||
xvfadd.s res4, VX0, res4
|
||||
xvfadd.s res4, VX1, res4
|
||||
xvfadd.s res4, VX2, res4
|
||||
xvfadd.s res4, VX3, res4
|
||||
xvfadd.s res4, x1, res4
|
||||
xvfadd.s res4, x2, res4
|
||||
xvfadd.s res4, x3, res4
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
/* .L997 / .L998 / .L999: scalar remainder loop and result assembly of the
 * LASX kernel.
 *
 * The vector loops handle N in groups of 4 (DOUBLE) or 8 (single)
 * complex elements; the leftover N mod 4 / N mod 8 elements are
 * processed here one at a time, accumulating into s1..s4.
 */
.L997:
#ifndef DOUBLE
    andi I, N, 7
#else
    andi I, N, 3
#endif
    bge $r0, I, .L999               // no leftover elements
    .align 3

.L998:
    // a1/a2 = Re/Im of the current X element, a3/a4 = Re/Im of Y.
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    // Step both pointers by their (already byte-scaled) strides.
    add.d Y, Y, INCY
    add.d X, X, INCX
    addi.d I, I, -1
    MADD s1, a1, a3, s1             // s1 += Re(x) * Re(y)
    MADD s2, a2, a3, s2             // s2 += Im(x) * Re(y)
    MADD s3, a1, a4, s3             // s3 += Re(x) * Im(y)
    MADD s4, a2, a4, s4             // s4 += Im(x) * Im(y)
    blt $r0, I, .L998
    .align 3

.L999:
    // Combine the four partial sums into the complex result returned in
    // $f0 (real part) and $f1 (imaginary part).
#ifdef CONJ
    ADD $f0, s1, s4
    SUB $f1, s3, s2
#else
    SUB $f0, s1, s4
    ADD $f1, s3, s2
#endif
    jirl $r0, $r1, 0x0              // return to caller
    .align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,397 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
#define I $r19
|
||||
#define TEMP $r10
|
||||
#define t1 $r11
|
||||
#define t2 $r12
|
||||
#define t3 $r13
|
||||
#define t4 $r14
|
||||
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define s1 $f16
|
||||
#define s2 $f17
|
||||
#define s3 $f18
|
||||
#define s4 $f19
|
||||
#define res1 $vr16
|
||||
#define res2 $vr17
|
||||
#define res3 $vr18
|
||||
#define res4 $vr19
|
||||
#define VX0 $vr12
|
||||
#define VX1 $vr13
|
||||
#define VX2 $vr14
|
||||
#define VX3 $vr15
|
||||
#define x1 $vr20
|
||||
#define x2 $vr21
|
||||
#define x3 $vr22
|
||||
#define x4 $vr23
|
||||
|
||||
/* Entry of the LSX (128-bit) complex dot-product kernel.
 *
 * N = element count, X/Y = vector base pointers, INCX/INCY = strides in
 * complex elements on entry (rescaled below with ZBASE_SHIFT and compared
 * with 2 * SIZE to detect unit stride; SIZE and ZBASE_SHIFT presumably
 * come from common.h -- confirm there).
 *
 * Accumulators:
 *   res1 += Re(x)*Re(y)   res2 += Im(x)*Re(y)
 *   res3 += Re(x)*Im(y)   res4 += Im(x)*Im(y)
 */
    PROLOGUE
    // Clear the accumulators before the early exit: .L999 reads s1..s4
    // ($f16..$f19, the same register numbers as res1..res4).
    vxor.v res4, res4, res4
    vxor.v res3, res3, res3
    vxor.v res2, res2, res2
    vxor.v res1, res1, res1
    bge $r0, N, .L999               // nothing to do for N <= 0

    // I = number of full vector iterations:
    // 2 complex elements per pass in double, 4 in single precision.
#ifndef DOUBLE
    srai.d I, N, 2
#else
    srai.d I, N, 1
#endif
    slli.d INCY, INCY, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    li.d TEMP, 2 * SIZE             // scaled stride of a contiguous vector

    // Pick one of four loops depending on which operands are contiguous.
    bne INCX, TEMP, .L20
    beq INCY, TEMP, .L11            // INCX==1 and INCY==1
    b .L12                          // INCX==1 and INCY!=1
.L20:
    beq INCY, TEMP, .L21            // INCX!=1 and INCY==1
    b .L22                          // INCX!=1 and INCY!=1
|
||||
|
||||
.L11:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L111:
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX3, Y, 2 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vpickev.d x3, VX3, VX2
|
||||
vpickod.d x4, VX3, VX2
|
||||
vfmadd.d res1, x1, x3, res1
|
||||
vfmadd.d res2, x2, x3, res2
|
||||
vfmadd.d res3, x1, x4, res3
|
||||
vfmadd.d res4, x2, x4, res4
|
||||
addi.d X, X, 4 * SIZE
|
||||
addi.d Y, Y, 4 * SIZE
|
||||
#else
|
||||
vld VX1, X, 4 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vpickev.w x3, VX3, VX2
|
||||
vpickod.w x4, VX3, VX2
|
||||
vfmadd.s res1, x1, x3, res1
|
||||
vfmadd.s res2, x2, x3, res2
|
||||
vfmadd.s res3, x1, x4, res3
|
||||
vfmadd.s res4, x2, x4, res4
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L111
|
||||
b .L996
|
||||
.align 3
|
||||
|
||||
.L12:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L121:
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
addi.d X, X, 4 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vfmadd.d res1, x1, x3, res1
|
||||
vfmadd.d res2, x2, x3, res2
|
||||
vfmadd.d res3, x1, x4, res3
|
||||
vfmadd.d res4, x2, x4, res4
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 0
|
||||
vinsgr2vr.w x4, t2, 0
|
||||
vinsgr2vr.w x3, t3, 1
|
||||
vinsgr2vr.w x4, t4, 1
|
||||
vld VX1, X, 4 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 2
|
||||
vinsgr2vr.w x4, t2, 2
|
||||
vinsgr2vr.w x3, t3, 3
|
||||
vinsgr2vr.w x4, t4, 3
|
||||
addi.d X, X, 8 * SIZE
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vfmadd.s res1, x1, x3, res1
|
||||
vfmadd.s res2, x2, x3, res2
|
||||
vfmadd.s res3, x1, x4, res3
|
||||
vfmadd.s res4, x2, x4, res4
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L121
|
||||
b .L996
|
||||
.align 3
|
||||
|
||||
.L21:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L211:
|
||||
vld VX2, Y, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX3, Y, 2 * SIZE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
addi.d Y, Y, 4 * SIZE
|
||||
vpickev.d x3, VX3, VX2
|
||||
vpickod.d x4, VX3, VX2
|
||||
vfmadd.d res1, x1, x3, res1
|
||||
vfmadd.d res2, x2, x3, res2
|
||||
vfmadd.d res3, x1, x4, res3
|
||||
vfmadd.d res4, x2, x4, res4
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 0
|
||||
vinsgr2vr.w x2, t2, 0
|
||||
vinsgr2vr.w x1, t3, 1
|
||||
vinsgr2vr.w x2, t4, 1
|
||||
vld VX3, Y, 4 * SIZE
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 2
|
||||
vinsgr2vr.w x2, t2, 2
|
||||
vinsgr2vr.w x1, t3, 3
|
||||
vinsgr2vr.w x2, t4, 3
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
vpickev.w x3, VX3, VX2
|
||||
vpickod.w x4, VX3, VX2
|
||||
vfmadd.s res1, x1, x3, res1
|
||||
vfmadd.s res2, x2, x3, res2
|
||||
vfmadd.s res3, x1, x4, res3
|
||||
vfmadd.s res4, x2, x4, res4
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L211
|
||||
b .L996
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L222:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
vfmadd.d res1, x1, x3, res1
|
||||
vfmadd.d res2, x2, x3, res2
|
||||
vfmadd.d res3, x1, x4, res3
|
||||
vfmadd.d res4, x2, x4, res4
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 0
|
||||
vinsgr2vr.w x2, t2, 0
|
||||
vinsgr2vr.w x1, t3, 1
|
||||
vinsgr2vr.w x2, t4, 1
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 0
|
||||
vinsgr2vr.w x4, t2, 0
|
||||
vinsgr2vr.w x3, t3, 1
|
||||
vinsgr2vr.w x4, t4, 1
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 2
|
||||
vinsgr2vr.w x2, t2, 2
|
||||
vinsgr2vr.w x1, t3, 3
|
||||
vinsgr2vr.w x2, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 2
|
||||
vinsgr2vr.w x4, t2, 2
|
||||
vinsgr2vr.w x3, t3, 3
|
||||
vinsgr2vr.w x4, t4, 3
|
||||
vfmadd.s res1, x1, x3, res1
|
||||
vfmadd.s res2, x2, x3, res2
|
||||
vfmadd.s res3, x1, x4, res3
|
||||
vfmadd.s res4, x2, x4, res4
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
.L996:
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d VX1, res1, 1
|
||||
vfadd.d res1, VX1, res1
|
||||
vreplvei.d VX1, res2, 1
|
||||
vfadd.d res2, VX1, res2
|
||||
vreplvei.d VX1, res3, 1
|
||||
vfadd.d res3, VX1, res3
|
||||
vreplvei.d VX1, res4, 1
|
||||
vfadd.d res4, VX1, res4
|
||||
#else
|
||||
vreplvei.w VX1, res1, 1
|
||||
vreplvei.w VX2, res1, 2
|
||||
vreplvei.w VX3, res1, 3
|
||||
vfadd.s res1, VX1, res1
|
||||
vfadd.s res1, VX2, res1
|
||||
vfadd.s res1, VX3, res1
|
||||
vreplvei.w VX1, res2, 1
|
||||
vreplvei.w VX2, res2, 2
|
||||
vreplvei.w VX3, res2, 3
|
||||
vfadd.s res2, VX1, res2
|
||||
vfadd.s res2, VX2, res2
|
||||
vfadd.s res2, VX3, res2
|
||||
vreplvei.w VX1, res3, 1
|
||||
vreplvei.w VX2, res3, 2
|
||||
vreplvei.w VX3, res3, 3
|
||||
vfadd.s res3, VX1, res3
|
||||
vfadd.s res3, VX2, res3
|
||||
vfadd.s res3, VX3, res3
|
||||
vreplvei.w VX1, res4, 1
|
||||
vreplvei.w VX2, res4, 2
|
||||
vreplvei.w VX3, res4, 3
|
||||
vfadd.s res4, VX1, res4
|
||||
vfadd.s res4, VX2, res4
|
||||
vfadd.s res4, VX3, res4
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
/* .L997 / .L998 / .L999: scalar remainder loop and result assembly of the
 * LSX kernel.
 *
 * The vector loops handle N in groups of 2 (DOUBLE) or 4 (single)
 * complex elements; the leftover N mod 2 / N mod 4 elements are
 * processed here one at a time, accumulating into s1..s4.
 */
.L997:
#ifndef DOUBLE
    andi I, N, 3
#else
    andi I, N, 1
#endif
    bge $r0, I, .L999               // no leftover elements
    .align 3

.L998:
    // a1/a2 = Re/Im of the current X element, a3/a4 = Re/Im of Y.
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    // Step both pointers by their (already byte-scaled) strides.
    add.d Y, Y, INCY
    add.d X, X, INCX
    addi.d I, I, -1
    MADD s1, a1, a3, s1             // s1 += Re(x) * Re(y)
    MADD s2, a2, a3, s2             // s2 += Im(x) * Re(y)
    MADD s3, a1, a4, s3             // s3 += Re(x) * Im(y)
    MADD s4, a2, a4, s4             // s4 += Im(x) * Im(y)
    blt $r0, I, .L998
    .align 3

.L999:
    // Combine the four partial sums into the complex result returned in
    // $f0 (real part) and $f1 (imaginary part).
#ifdef CONJ
    ADD $f0, s1, s4
    SUB $f1, s3, s2
#else
    SUB $f0, s1, s4
    ADD $f1, s3, s2
#endif
    jirl $r0, $r1, 0x0              // return to caller
    .align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,857 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: bm
|
||||
#define N $r5 // param 2: bn
|
||||
#define K $r6 // param 3: bk
|
||||
#define ALPHA_R $f0 // param 4: alphar
|
||||
#define ALPHA_I $f1 // param 5: alphai
|
||||
#define A $r7 // param 6: ba
|
||||
#define B $r8 // param 7: bb
|
||||
#define C $r9 // param 8: bc
|
||||
#define LDC $r10 // param 9: ldc
|
||||
|
||||
#if defined (TRMMKERNEL)
|
||||
#define OFFSET $r11 // param 10: offset
|
||||
#endif
|
||||
#define OFF $r26
|
||||
|
||||
#define I $r12
|
||||
#define J $r13
|
||||
#define L $r14
|
||||
#define TL $r15
|
||||
#define A0 $r16
|
||||
#define B0 $r17
|
||||
#define C0 $r18
|
||||
#define C1 $r19
|
||||
#define C2 $r20
|
||||
#define C3 $r23
|
||||
#define T0 $r24
|
||||
#define T1 $r25
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
#define b1 $f10
|
||||
#define b2 $f11
|
||||
#define b3 $f12
|
||||
#define b4 $f13
|
||||
#define b5 $f14
|
||||
#define b6 $f15
|
||||
#define b7 $f16
|
||||
#define b8 $f17
|
||||
#define c11 $f18
|
||||
#define c12 $f19
|
||||
#define c21 $f20
|
||||
#define c22 $f21
|
||||
#define c31 $f22
|
||||
#define c32 $f23
|
||||
#define c41 $f24
|
||||
#define c42 $f25
|
||||
|
||||
/* LASX vectors */
|
||||
#define U0 $xr30
|
||||
#define U1 $xr31
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define U8 $xr8
|
||||
#define U9 $xr9
|
||||
#define U10 $xr10
|
||||
#define U11 $xr11
|
||||
#define U12 $xr12
|
||||
#define U13 $xr13
|
||||
#define U14 $xr14
|
||||
#define U15 $xr15
|
||||
#define D0 $xr16
|
||||
#define D1 $xr17
|
||||
#define D2 $xr18
|
||||
#define D3 $xr19
|
||||
#define D4 $xr20
|
||||
#define D5 $xr21
|
||||
#define D6 $xr22
|
||||
#define D7 $xr23
|
||||
#define D8 $xr24
|
||||
#define D9 $xr25
|
||||
#define D10 $xr26
|
||||
#define D11 $xr27
|
||||
#define VALPHAR $xr28
|
||||
#define VALPHAI $xr29
|
||||
|
||||
|
||||
/* Per-variant operator tables.
 *
 * Each kernel variant (selected by one of the NN ... CC build macros)
 * maps four numbered "slots" onto either an accumulate or a
 * negate/subtract form, for four operator families:
 *   XVMADDn  - LASX fused multiply-add family  (XVFMADD / XVNMSUB)
 *   VMADDn   - LSX  fused multiply-add family  (VFMADD  / VNMSUB)
 *   XVFADDn  - LASX add/subtract family        (XVFADD  / XVFSUB)
 *   MADDn    - scalar fused multiply-add family (MADD   / NMSUB)
 *
 * Sign pattern per slot ('+' = *MADD/*FADD form, '-' = *NMSUB/*FSUB form):
 *
 *   variant group        slot1 slot2 slot3 slot4
 *   NN NT TN TT            +     +     -     +
 *   NR NC TR TC            +     +     +     -
 *   RN RT CN CT            +     -     +     +
 *   RR RC CR CC            +     -     -     -
 *
 * NOTE(review): the groups presumably correspond to which of the two
 * operands is conjugated -- confirm against the kernel body that uses
 * these slots.  The right-hand-side macros are not defined in this file.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* slot 1: accumulate */
#define XVMADD1 XVFMADD
#define VMADD1  VFMADD
#define XVFADD1 XVFADD
#define MADD1   MADD
/* slot 2: accumulate */
#define XVMADD2 XVFMADD
#define VMADD2  VFMADD
#define XVFADD2 XVFADD
#define MADD2   MADD
/* slot 3: negate / subtract */
#define XVMADD3 XVNMSUB
#define VMADD3  VNMSUB
#define XVFADD3 XVFSUB
#define MADD3   NMSUB
/* slot 4: accumulate */
#define XVMADD4 XVFMADD
#define VMADD4  VFMADD
#define XVFADD4 XVFADD
#define MADD4   MADD
#endif

#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* slot 1: accumulate */
#define XVMADD1 XVFMADD
#define VMADD1  VFMADD
#define XVFADD1 XVFADD
#define MADD1   MADD
/* slot 2: accumulate */
#define XVMADD2 XVFMADD
#define VMADD2  VFMADD
#define XVFADD2 XVFADD
#define MADD2   MADD
/* slot 3: accumulate */
#define XVMADD3 XVFMADD
#define VMADD3  VFMADD
#define XVFADD3 XVFADD
#define MADD3   MADD
/* slot 4: negate / subtract */
#define XVMADD4 XVNMSUB
#define VMADD4  VNMSUB
#define XVFADD4 XVFSUB
#define MADD4   NMSUB
#endif

#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* slot 1: accumulate */
#define XVMADD1 XVFMADD
#define VMADD1  VFMADD
#define XVFADD1 XVFADD
#define MADD1   MADD
/* slot 2: negate / subtract */
#define XVMADD2 XVNMSUB
#define VMADD2  VNMSUB
#define XVFADD2 XVFSUB
#define MADD2   NMSUB
/* slot 3: accumulate */
#define XVMADD3 XVFMADD
#define VMADD3  VFMADD
#define XVFADD3 XVFADD
#define MADD3   MADD
/* slot 4: accumulate */
#define XVMADD4 XVFMADD
#define VMADD4  VFMADD
#define XVFADD4 XVFADD
#define MADD4   MADD
#endif

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* slot 1: accumulate */
#define XVMADD1 XVFMADD
#define VMADD1  VFMADD
#define XVFADD1 XVFADD
#define MADD1   MADD
/* slot 2: negate / subtract */
#define XVMADD2 XVNMSUB
#define VMADD2  VNMSUB
#define XVFADD2 XVFSUB
#define MADD2   NMSUB
/* slot 3: negate / subtract */
#define XVMADD3 XVNMSUB
#define VMADD3  VNMSUB
#define XVFADD3 XVFSUB
#define MADD3   NMSUB
/* slot 4: negate / subtract */
#define XVMADD4 XVNMSUB
#define VMADD4  VNMSUB
#define XVFADD4 XVFSUB
#define MADD4   NMSUB
#endif
|
||||
|
||||
PROLOGUE // function entry; the PROLOGUE macro is defined outside this chunk
|
||||
|
||||
addi.d $sp, $sp, -128 // open a 128-byte frame: GPR slots at 0-32, FPR slots at 40-104, alpha at 112/120
|
||||
SDARG $r23, $sp, 0 // spill $r23-$r27; they are reloaded from the same slots at .L30
|
||||
SDARG $r24, $sp, 8
|
||||
SDARG $r25, $sp, 16
|
||||
SDARG $r26, $sp, 24
|
||||
SDARG $r27, $sp, 32
|
||||
ST $f23, $sp, 40 // spill $f23-$f31 into 8-byte slots
|
||||
ST $f24, $sp, 48 // NOTE(review): ST looks like a 32-bit store in this kernel (alpha stored with ST is re-read
|
||||
ST $f25, $sp, 56 // below as a single word). If so, only the low 32 bits of $f24-$f31 are preserved,
|
||||
ST $f26, $sp, 64 // while the vector code overwrites the whole registers - confirm against common.h
|
||||
ST $f27, $sp, 72 // and the LoongArch psABI callee-saved rules.
|
||||
ST $f28, $sp, 80
|
||||
ST $f29, $sp, 88
|
||||
ST $f30, $sp, 96
|
||||
ST $f31, $sp, 104
|
||||
ST ALPHA_R,$sp, 112 // park alpha (real part) in the frame so it can be reloaded into a vector register
|
||||
ST ALPHA_I,$sp, 120 // park alpha (imaginary part)
|
||||
|
||||
xvldrepl.w VALPHAR, $sp, 112 // reload the 32-bit alpha_r word from the frame, replicated across VALPHAR
|
||||
xvldrepl.w VALPHAI, $sp, 120 // same for alpha_i into VALPHAI
|
||||
|
||||
#if defined (TRMMKERNEL) && !defined(LEFT)
|
||||
sub.d OFF, $r0, OFFSET
|
||||
#else
|
||||
xor OFF, OFF, OFF
|
||||
#endif
|
||||
|
||||
slli.d LDC, LDC, 2
|
||||
|
||||
move J, $r0
|
||||
srai.d T0, N, 1
|
||||
beq J, T0, .L19
|
||||
|
||||
.L10: /* for(j=0; j<bn/2; j+=1) */
|
||||
move C0, C
|
||||
slli.d TL, LDC, 1
|
||||
add.d C1, C0, TL
|
||||
move A0, A //ptrba
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L150
|
||||
|
||||
.L11: /* for(i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF //temp
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
xvxor.v U0, U0, U0
|
||||
xvxor.v U1, U1, U1
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
srai.d C2, TL, 2
|
||||
beq L, C2, .L130
|
||||
blt C2, L, .L130
|
||||
|
||||
.L12: /* for(k=0; k<bk/4; k+=1) -- each pass consumes 0x40 bytes of A and of B (16 floats each) */
|
||||
xvld D0, A0, 0x00 //a 0-7
|
||||
xvld D1, A0, 0x20 //a 8-15
|
||||
xvld D2, B0, 0x00 //b 0-7
|
||||
xvld D3, B0, 0x20 //b 8-15
|
||||
|
||||
xvand.v D4, D0, D0 // x & x: plain register copy of D0
|
||||
xvpermi.q D4, D1, 0x02 //a 0 1 2 3 8 9 10 11
|
||||
xvand.v D5, D4, D4 // copy, so even and odd lanes can be shuffled separately
|
||||
xvshuf4i.w D4, D4, 0x88 //a 0 2 0 2 8 10 8 10
|
||||
xvshuf4i.w D5, D5, 0xdd //a 1 3 1 3 9 11 9 11
|
||||
|
||||
xvand.v D6, D1, D1 // copy of D1
|
||||
xvpermi.q D6, D0, 0x31 //a 4 5 6 7 12 13 14 15
|
||||
xvand.v D7, D6, D6
|
||||
xvshuf4i.w D6, D6, 0x88 //a 4 6 4 6 12 14 12 14
|
||||
xvshuf4i.w D7, D7, 0xdd //a 5 7 5 7 13 15 13 15
|
||||
|
||||
xvand.v D8, D2, D2 // copy of D2 (B data)
|
||||
xvpermi.q D8, D3, 0x02 //b 0 1 2 3 8 9 10 11
|
||||
xvand.v D9, D8, D8
|
||||
xvshuf4i.w D8, D8, 0xa0 //b 0 0 2 2 8 8 10 10
|
||||
xvshuf4i.w D9, D9, 0xf5 //b 1 1 3 3 9 9 11 11
|
||||
|
||||
xvand.v D10, D3, D3 // copy of D3 (B data)
|
||||
xvpermi.q D10, D2, 0x31 //b 4 5 6 7 12 13 14 15
|
||||
xvand.v D11, D10, D10
|
||||
xvshuf4i.w D10, D10, 0xa0 //b 4 4 6 6 12 12 14 14
|
||||
xvshuf4i.w D11, D11, 0xf5 //b 5 5 7 7 13 13 15 15
|
||||
|
||||
XVMADD1 U0, D4, D8, U0 //res0 2 4 6 0 2 4 6
|
||||
XVMADD2 U1, D5, D8, U1 //res1 3 5 7 1 3 5 7
|
||||
|
||||
xvpermi.q U0, U0, 0x01 // exchange the two 128-bit halves of U0, then repeat the same FMA so each half
|
||||
xvpermi.q U1, U1, 0x01 // also receives the other half's product (appears to keep both halves equal - verify)
|
||||
XVMADD1 U0, D4, D8, U0
|
||||
XVMADD2 U1, D5, D8, U1
|
||||
|
||||
XVMADD3 U0, D5, D9, U0
|
||||
XVMADD4 U1, D4, D9, U1
|
||||
|
||||
xvpermi.q U0, U0, 0x01 // same half-exchange-and-repeat step for the XVMADD3/XVMADD4 terms
|
||||
xvpermi.q U1, U1, 0x01
|
||||
XVMADD3 U0, D5, D9, U0
|
||||
XVMADD4 U1, D4, D9, U1
|
||||
|
||||
XVMADD1 U0, D6, D10, U0 //res0 2 4 6 0 2 4 6
|
||||
XVMADD2 U1, D7, D10, U1 //res1 3 5 7 1 3 5 7
|
||||
|
||||
xvpermi.q U0, U0, 0x01 // half-exchange-and-repeat for the second operand group (D6/D7 with D10/D11)
|
||||
xvpermi.q U1, U1, 0x01
|
||||
XVMADD1 U0, D6, D10, U0
|
||||
XVMADD2 U1, D7, D10, U1
|
||||
|
||||
XVMADD3 U0, D7, D11, U0
|
||||
XVMADD4 U1, D6, D11, U1
|
||||
|
||||
xvpermi.q U0, U0, 0x01
|
||||
xvpermi.q U1, U1, 0x01
|
||||
XVMADD3 U0, D7, D11, U0
|
||||
XVMADD4 U1, D6, D11, U1
|
||||
|
||||
addi.d A0, A0, 0x40 // advance A past the 16 floats just consumed
|
||||
addi.d B0, B0, 0x40 // advance B likewise
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L12 // C2 holds TL >> 2, set just before the loop
|
||||
|
||||
.L130:
|
||||
move L, $r0
|
||||
andi C2, TL, 3
|
||||
beq L, C2, .L14
|
||||
|
||||
.L13: /* for(k=0; k<(bk&3); k+=1) -- remainder loop, one k step (0x10 bytes of A and of B) per pass */
|
||||
vld $vr16, A0, 0x00 //a0 a1 a2 a3
|
||||
vld $vr17, B0, 0x00 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w $vr20, $vr17, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w $vr21, $vr17, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w $vr18, $vr16, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w $vr19, $vr16, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 $vr30, $vr18, $vr20, $vr30 //res0 2 4 6 (even result lanes accumulate in $vr30)
|
||||
VMADD2 $vr31, $vr19, $vr20, $vr31 //res1 3 5 7 (odd result lanes accumulate in $vr31)
|
||||
VMADD3 $vr30, $vr19, $vr21, $vr30 // cross term for the even lanes; add or subtract is chosen by the VMADD3 mapping
|
||||
VMADD4 $vr31, $vr18, $vr21, $vr31 // cross term for the odd lanes; add or subtract is chosen by the VMADD4 mapping
|
||||
|
||||
addi.d A0, A0, 0x10 // next k step of A
|
||||
addi.d B0, B0, 0x10 // next k step of B
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L13 // C2 holds TL & 3, set at .L130
|
||||
|
||||
.L14: /* scale the accumulators in $vr30 (even lanes) / $vr31 (odd lanes) by alpha and write them to C0/C1 */
|
||||
#if defined(TRMMKERNEL)
|
||||
// TRMM overwrites C, so C is not read here: the former vld/vpackev/vpackod/vpermi
|
||||
// sequence only produced values that the vfmul.s below replaced before any use.
|
||||
// $vr28/$vr29 are presumably the alpha_r/alpha_i vectors loaded in the prologue - aliases are outside this chunk.
|
||||
vfmul.s $vr10, $vr30, $vr28 // even lanes: acc_even * $vr28
|
||||
|
||||
||||
vfmul.s $vr11, $vr31, $vr28 // odd lanes: acc_odd * $vr28
|
||||
|
||||
||||
VNMSUB $vr10, $vr31, $vr29, $vr10 // even lanes: fold in the acc_odd * $vr29 term (same formula as the scalar MUL/MUL/SUB path at .L17)
|
||||
|
||||
||||
VFMADD $vr11, $vr30, $vr29, $vr11 // odd lanes: add the acc_even * $vr29 term
|
||||
|
||||
||||
|
||||
vilvl.w $vr8, $vr11, $vr10 //0 1 2 3
|
||||
|
||||
||||
|
||||
vilvh.w $vr9, $vr11, $vr10 //4 5 6 7
|
||||
|
||||
||||
|
||||
vst $vr8, C0, 0x00
|
||||
|
||||
||||
vst $vr9, C1, 0x00
|
||||
|
||||
||||
#else
|
||||
vld $vr8, C0, 0x00 //0 1 2 3
|
||||
vld $vr9, C1, 0x00 //4 5 6 7
|
||||
|
||||
vpackev.w $vr10, $vr9, $vr8 //0 4 2 6
|
||||
vpermi.w $vr10, $vr10, 0xd8 //0 2 4 6
|
||||
|
||||
vpackod.w $vr11, $vr9, $vr8 //1 5 3 7
|
||||
vpermi.w $vr11, $vr11, 0xd8 //1 3 5 7
|
||||
|
||||
VFMADD $vr10, $vr30, $vr28, $vr10 // GEMM accumulates into the existing C values: even lanes += acc_even * $vr28
|
||||
VFMADD $vr11, $vr31, $vr28, $vr11 // odd lanes += acc_odd * $vr28
|
||||
VNMSUB $vr10, $vr31, $vr29, $vr10 // even lanes: fold in the acc_odd * $vr29 term
|
||||
VFMADD $vr11, $vr30, $vr29, $vr11 // odd lanes += acc_even * $vr29
|
||||
|
||||
vilvl.w $vr8, $vr11, $vr10 //0 1 2 3
|
||||
|
||||
vilvh.w $vr9, $vr11, $vr10 //4 5 6 7
|
||||
|
||||
vst $vr8, C0, 0x00
|
||||
vst $vr9, C1, 0x00
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
addi.d C1, C1, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L11
|
||||
|
||||
.L150:
|
||||
move I, $r0
|
||||
andi T0, M, 1
|
||||
beq I, T0, .L18
|
||||
|
||||
.L15: /* for(i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L17
|
||||
blt TL, L, .L17
|
||||
|
||||
.L16: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD b3, B0, 0x08 //load4
|
||||
MADD1 c21, a1, b3, c21 //res2
|
||||
MADD2 c22, a2, b3, c22 //res3
|
||||
LD b4, B0, 0x0c //load5
|
||||
MADD3 c21, a2, b4, c21
|
||||
MADD4 c22, a1, b4, c22
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x10
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L16
|
||||
|
||||
.L17:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL b5, c21, ALPHA_R
|
||||
MUL b6, c22, ALPHA_I
|
||||
SUB b5, b5, b6
|
||||
ST b5, C1, 0x00
|
||||
|
||||
MUL b5, c22, ALPHA_R
|
||||
MUL b6, c21, ALPHA_I
|
||||
ADD b6, b5, b6
|
||||
ST b6, C1, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD b5, C1, 0x00 //C1[0]
|
||||
LD b6, C1, 0x04 //C1[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MADD b5, c21, ALPHA_R, b5
|
||||
MADD b6, c22, ALPHA_R, b6
|
||||
NMSUB b5, c22, ALPHA_I, b5
|
||||
MADD b6, c21, ALPHA_I, b6
|
||||
ST b5, C1, 0x00
|
||||
ST b6, C1, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x04
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
addi.d C1, C1, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L15
|
||||
|
||||
.L18:
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
slli.d L, K, 0x04
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 0x02
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
srai.d T0, N, 1
|
||||
blt J, T0, .L10
|
||||
|
||||
.L19:
|
||||
move J, $r0
|
||||
andi T0, N, 1
|
||||
beq J, T0, .L30
|
||||
|
||||
.L20: /* for (j=0; j<(bn&1); j+=1) */
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move C0, C
|
||||
move A0, A //ptrba
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L24
|
||||
|
||||
.L21: /* for (i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L23
|
||||
blt TL, L, .L23
|
||||
|
||||
.L22: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD a3, A0, 0x08 //load4
|
||||
MADD1 c21, a3, b1, c21 //res2
|
||||
LD a4, A0, 0x0c //load5
|
||||
MADD2 c22, a4, b1, c22 //res3
|
||||
MADD3 c21, a4, b2, c21
|
||||
MADD4 c22, a3, b2, c22
|
||||
|
||||
addi.d A0, A0, 0x10
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L22
|
||||
|
||||
.L23:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL a7, c21, ALPHA_R
|
||||
MUL a8, c22, ALPHA_I
|
||||
SUB a7, a7, a8
|
||||
ST a7, C0, 0x08
|
||||
|
||||
MUL a7, c22, ALPHA_R
|
||||
MUL a8, c21, ALPHA_I
|
||||
ADD a8, a7, a8
|
||||
ST a8, C0, 0x0c
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD a7, C0, 0x08 //C1[2]
|
||||
LD a8, C0, 0x0c //C1[3]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
MADD a7, c21, ALPHA_R, a7
|
||||
MADD a8, c22, ALPHA_R, a8
|
||||
NMSUB a7, c22, ALPHA_I, a7
|
||||
MADD a8, c21, ALPHA_I, a8
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
ST a7, C0, 0x08
|
||||
ST a8, C0, 0x0c
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x03
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L21
|
||||
|
||||
.L24:
|
||||
move I, $r0
|
||||
andi T1, M, 1 //bm&1
|
||||
beq I, T1, .L28
|
||||
|
||||
.L25: /* for (i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L27
|
||||
blt TL, L, .L27
|
||||
|
||||
.L26: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L26
|
||||
|
||||
.L27:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T1, .L25
|
||||
|
||||
.L28:
|
||||
slli.d L, K, 3
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 1
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
andi T0, N, 1
|
||||
blt J, T0, .L20
|
||||
|
||||
.L30: /* common exit: reload the registers spilled in the prologue and return */
|
||||
LDARG $r23, $sp, 0 // $r23-$r27 come back from frame offsets 0-32
|
||||
LDARG $r24, $sp, 8
|
||||
LDARG $r25, $sp, 16
|
||||
LDARG $r26, $sp, 24
|
||||
LDARG $r27, $sp, 32
|
||||
LD $f23, $sp, 40 // $f23-$f31 come back from frame offsets 40-104
|
||||
LD $f24, $sp, 48 // NOTE(review): if LD is a 32-bit load in this kernel, the upper halves of these
|
||||
LD $f25, $sp, 56 // registers are not restored - confirm together with the ST width used in the prologue
|
||||
LD $f26, $sp, 64
|
||||
LD $f27, $sp, 72
|
||||
LD $f28, $sp, 80
|
||||
LD $f29, $sp, 88
|
||||
LD $f30, $sp, 96
|
||||
LD $f31, $sp, 104
|
||||
|
||||
addi.d $sp, $sp, 128 // release the 128-byte frame opened in the prologue
|
||||
jirl $r0, $r1, 0x0 // jump to the address in $r1, discarding the link value into $r0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,812 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: bm
|
||||
#define N $r5 // param 2: bn
|
||||
#define K $r6 // param 3: bk
|
||||
#define ALPHA_R $f0 // param 4: alphar
|
||||
#define ALPHA_I $f1 // param 5: alphai
|
||||
#define A $r7 // param 6: ba
|
||||
#define B $r8 // param 7: bb
|
||||
#define C $r9 // param 8: bc
|
||||
#define LDC $r10 // param 9: ldc
|
||||
|
||||
#if defined (TRMMKERNEL)
|
||||
#define OFFSET $r11 // param 10: offset
|
||||
#endif
|
||||
#define OFF $r26
|
||||
|
||||
#define I $r12
|
||||
#define J $r13
|
||||
#define L $r14
|
||||
#define TL $r15
|
||||
#define A0 $r16
|
||||
#define B0 $r17
|
||||
#define C0 $r18
|
||||
#define C1 $r19
|
||||
#define C2 $r20
|
||||
#define C3 $r23
|
||||
#define T0 $r24
|
||||
#define T1 $r25
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
#define b1 $f10
|
||||
#define b2 $f11
|
||||
#define b3 $f12
|
||||
#define b4 $f13
|
||||
#define b5 $f14
|
||||
#define b6 $f15
|
||||
#define b7 $f16
|
||||
#define b8 $f17
|
||||
#define c11 $f18
|
||||
#define c12 $f19
|
||||
#define c21 $f20
|
||||
#define c22 $f21
|
||||
#define c31 $f22
|
||||
#define c32 $f23
|
||||
#define c41 $f24
|
||||
#define c42 $f25
|
||||
|
||||
/* LASX vectors */
|
||||
#define U0 $vr30
|
||||
#define U1 $vr31
|
||||
#define U2 $vr2
|
||||
#define U3 $vr3
|
||||
#define U4 $vr4
|
||||
#define U5 $vr5
|
||||
#define U6 $vr6
|
||||
#define U7 $vr7
|
||||
#define U8 $vr8
|
||||
#define U9 $vr9
|
||||
#define U10 $vr10
|
||||
#define U11 $vr11
|
||||
#define U12 $vr12
|
||||
#define U13 $vr13
|
||||
#define U14 $vr14
|
||||
#define U15 $vr15
|
||||
#define D0 $vr16
|
||||
#define D1 $vr17
|
||||
#define D2 $vr18
|
||||
#define D3 $vr19
|
||||
#define D4 $vr20
|
||||
#define D5 $vr21
|
||||
#define D6 $vr22
|
||||
#define D7 $vr23
|
||||
#define D8 $vr24
|
||||
#define D9 $vr25
|
||||
#define D10 $vr26
|
||||
#define D11 $vr27
|
||||
#define VALPHAR $vr28
|
||||
#define VALPHAI $vr29
|
||||
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VFMADD
|
||||
#define VMADD3 VNMSUB
|
||||
#define VMADD4 VFMADD
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 MADD
|
||||
#define MADD3 NMSUB
|
||||
#define MADD4 MADD
|
||||
#endif
|
||||
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VFMADD
|
||||
#define VMADD3 VFMADD
|
||||
#define VMADD4 VNMSUB
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 MADD
|
||||
#define MADD3 MADD
|
||||
#define MADD4 NMSUB
|
||||
#endif
|
||||
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VNMSUB
|
||||
#define VMADD3 VFMADD
|
||||
#define VMADD4 VFMADD
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 NMSUB
|
||||
#define MADD3 MADD
|
||||
#define MADD4 MADD
|
||||
#endif
|
||||
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VNMSUB
|
||||
#define VMADD3 VNMSUB
|
||||
#define VMADD4 VNMSUB
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 NMSUB
|
||||
#define MADD3 NMSUB
|
||||
#define MADD4 NMSUB
|
||||
#endif
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -128
|
||||
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
SDARG $r25, $sp, 16
|
||||
SDARG $r26, $sp, 24
|
||||
SDARG $r27, $sp, 32
|
||||
ST $f23, $sp, 40
|
||||
ST $f24, $sp, 48
|
||||
ST $f25, $sp, 56
|
||||
ST $f26, $sp, 64
|
||||
ST $f27, $sp, 72
|
||||
ST $f28, $sp, 80
|
||||
ST $f29, $sp, 88
|
||||
ST $f30, $sp, 96
|
||||
ST $f31, $sp, 104
|
||||
ST ALPHA_R,$sp, 112
|
||||
ST ALPHA_I,$sp, 120
|
||||
|
||||
vldrepl.w VALPHAR, $sp, 112
|
||||
vldrepl.w VALPHAI, $sp, 120
|
||||
|
||||
#if defined (TRMMKERNEL) && !defined(LEFT)
|
||||
sub.d OFF, $r0, OFFSET
|
||||
#else
|
||||
xor OFF, OFF, OFF
|
||||
#endif
|
||||
|
||||
slli.d LDC, LDC, 2
|
||||
|
||||
move J, $r0
|
||||
srai.d T0, N, 1
|
||||
beq J, T0, .L19
|
||||
|
||||
.L10: /* for(j=0; j<bn/2; j+=1) */
|
||||
move C0, C
|
||||
slli.d TL, LDC, 1
|
||||
add.d C1, C0, TL
|
||||
move A0, A //ptrba
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L150
|
||||
|
||||
.L11: /* for(i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF //temp
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
vxor.v U0, U0, U0
|
||||
vxor.v U1, U1, U1
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
srai.d C2, TL, 2
|
||||
beq L, C2, .L130
|
||||
blt C2, L, .L130
|
||||
|
||||
.L12: /* for(k=0; k<bk/4; k+=1) -- four k steps per pass, 0x10 bytes of A and of B per step */
|
||||
vld D0, A0, 0x00 //a0 a1 a2 a3
|
||||
vld D1, B0, 0x00 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
vld D0, A0, 0x10 //a4 a5 a6 a7 (second k step)
|
||||
vld D1, B0, 0x10 //b4 b5 b6 b7
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b4 b4 b6 b6
|
||||
vshuf4i.w D5, D1, 0xf5 //b5 b5 b7 b7
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a4 a6 a4 a6
|
||||
vshuf4i.w D3, D0, 0xdd //a5 a7 a5 a7
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
vld D0, A0, 0x20 //a8 a9 a10 a11 (third k step)
|
||||
vld D1, B0, 0x20 //b8 b9 b10 b11
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b8 b8 b10 b10
|
||||
vshuf4i.w D5, D1, 0xf5 //b9 b9 b11 b11
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a8 a10 a8 a10
|
||||
vshuf4i.w D3, D0, 0xdd //a9 a11 a9 a11
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
vld D0, A0, 0x30 //a12 a13 a14 a15 (fourth k step)
|
||||
vld D1, B0, 0x30 //b12 b13 b14 b15
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b12 b12 b14 b14
|
||||
vshuf4i.w D5, D1, 0xf5 //b13 b13 b15 b15
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a12 a14 a12 a14
|
||||
vshuf4i.w D3, D0, 0xdd //a13 a15 a13 a15
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
addi.d A0, A0, 0x40 // advance A past the four k steps just consumed
|
||||
addi.d B0, B0, 0x40 // advance B likewise
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L12 // C2 holds TL >> 2, set just before the loop
|
||||
|
||||
.L130:
|
||||
move L, $r0
|
||||
andi C2, TL, 3
|
||||
beq L, C2, .L14
|
||||
|
||||
.L13: /* for(k=0; k<(bk&3); k+=1) */
|
||||
vld D0, A0, 0x00 //a0 a1 a2 a3
|
||||
vld D1, B0, 0x00 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
addi.d A0, A0, 0x10
|
||||
addi.d B0, B0, 0x10
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L13
|
||||
|
||||
.L14: /* scale the accumulators U0 (even lanes) / U1 (odd lanes) by alpha and write them to C0/C1 */
|
||||
#if defined(TRMMKERNEL)
|
||||
// TRMM overwrites C, so C is not read here: the former vld/vpackev/vpackod/vpermi
|
||||
// sequence filled U8-U11 with values that vfmul.s and vilvl.w/vilvh.w replaced before any use.
|
||||
vfmul.s U10, U0, VALPHAR // even lanes: acc_even * alpha_r
|
||||
|
||||
||||
vfmul.s U11, U1, VALPHAR // odd lanes: acc_odd * alpha_r
|
||||
|
||||
||||
VNMSUB U10, U1, VALPHAI, U10 // even lanes: fold in the acc_odd * alpha_i term (same formula as the scalar MUL/MUL/SUB path at .L17)
|
||||
|
||||
||||
VFMADD U11, U0, VALPHAI, U11 // odd lanes: add the acc_even * alpha_i term
|
||||
|
||||
||||
|
||||
vilvl.w U8, U11, U10 //0 1 2 3
|
||||
|
||||
||||
|
||||
vilvh.w U9, U11, U10 //4 5 6 7
|
||||
|
||||
||||
|
||||
vst U8, C0, 0x00
|
||||
|
||||
||||
vst U9, C1, 0x00
|
||||
|
||||
||||
#else
|
||||
vld U8, C0, 0x00 //0 1 2 3
|
||||
vld U9, C1, 0x00 //4 5 6 7
|
||||
|
||||
vpackev.w U10, U9, U8 //0 4 2 6
|
||||
vpermi.w U10, U10, 0xd8 //0 2 4 6
|
||||
|
||||
vpackod.w U11, U9, U8 //1 5 3 7
|
||||
vpermi.w U11, U11, 0xd8 //1 3 5 7
|
||||
|
||||
VFMADD U10, U0, VALPHAR, U10 // GEMM accumulates into the existing C values: even lanes += acc_even * alpha_r
|
||||
VFMADD U11, U1, VALPHAR, U11 // odd lanes += acc_odd * alpha_r
|
||||
VNMSUB U10, U1, VALPHAI, U10 // even lanes: fold in the acc_odd * alpha_i term
|
||||
VFMADD U11, U0, VALPHAI, U11 // odd lanes += acc_even * alpha_i
|
||||
|
||||
vilvl.w U8, U11, U10 //0 1 2 3
|
||||
|
||||
vilvh.w U9, U11, U10 //4 5 6 7
|
||||
|
||||
vst U8, C0, 0x00
|
||||
vst U9, C1, 0x00
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
addi.d C1, C1, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L11
|
||||
|
||||
.L150:
|
||||
move I, $r0
|
||||
andi T0, M, 1
|
||||
beq I, T0, .L18
|
||||
|
||||
.L15: /* for(i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L17
|
||||
blt TL, L, .L17
|
||||
|
||||
.L16: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD b3, B0, 0x08 //load4
|
||||
MADD1 c21, a1, b3, c21 //res2
|
||||
MADD2 c22, a2, b3, c22 //res3
|
||||
LD b4, B0, 0x0c //load5
|
||||
MADD3 c21, a2, b4, c21
|
||||
MADD4 c22, a1, b4, c22
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x10
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L16
|
||||
|
||||
.L17:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL b5, c21, ALPHA_R
|
||||
MUL b6, c22, ALPHA_I
|
||||
SUB b5, b5, b6
|
||||
ST b5, C1, 0x00
|
||||
|
||||
MUL b5, c22, ALPHA_R
|
||||
MUL b6, c21, ALPHA_I
|
||||
ADD b6, b5, b6
|
||||
ST b6, C1, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD b5, C1, 0x00 //C1[0]
|
||||
LD b6, C1, 0x04 //C1[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MADD b5, c21, ALPHA_R, b5
|
||||
MADD b6, c22, ALPHA_R, b6
|
||||
NMSUB b5, c22, ALPHA_I, b5
|
||||
MADD b6, c21, ALPHA_I, b6
|
||||
ST b5, C1, 0x00
|
||||
ST b6, C1, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x04
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
addi.d C1, C1, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L15
|
||||
|
||||
.L18:
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
slli.d L, K, 0x04
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 0x02
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
srai.d T0, N, 1
|
||||
blt J, T0, .L10
|
||||
|
||||
.L19:
|
||||
move J, $r0
|
||||
andi T0, N, 1
|
||||
beq J, T0, .L30
|
||||
|
||||
.L20: /* for (j=0; j<(bn&1); j+=1) */
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move C0, C
|
||||
move A0, A //ptrba
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L24
|
||||
|
||||
.L21: /* for (i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L23
|
||||
blt TL, L, .L23
|
||||
|
||||
.L22: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD a3, A0, 0x08 //load4
|
||||
MADD1 c21, a3, b1, c21 //res2
|
||||
LD a4, A0, 0x0c //load5
|
||||
MADD2 c22, a4, b1, c22 //res3
|
||||
MADD3 c21, a4, b2, c21
|
||||
MADD4 c22, a3, b2, c22
|
||||
|
||||
addi.d A0, A0, 0x10
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L22
|
||||
|
||||
.L23:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL a7, c21, ALPHA_R
|
||||
MUL a8, c22, ALPHA_I
|
||||
SUB a7, a7, a8
|
||||
ST a7, C0, 0x08
|
||||
|
||||
MUL a7, c22, ALPHA_R
|
||||
MUL a8, c21, ALPHA_I
|
||||
ADD a8, a7, a8
|
||||
ST a8, C0, 0x0c
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD a7, C0, 0x08 //C1[2]
|
||||
LD a8, C0, 0x0c //C1[3]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
MADD a7, c21, ALPHA_R, a7
|
||||
MADD a8, c22, ALPHA_R, a8
|
||||
NMSUB a7, c22, ALPHA_I, a7
|
||||
MADD a8, c21, ALPHA_I, a8
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
ST a7, C0, 0x08
|
||||
ST a8, C0, 0x0c
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x03
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L21
|
||||
|
||||
.L24:
|
||||
move I, $r0
|
||||
andi T1, M, 1 //bm&1
|
||||
beq I, T1, .L28
|
||||
|
||||
.L25: /* for (i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L27
|
||||
blt TL, L, .L27
|
||||
|
||||
.L26: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L26
|
||||
|
||||
.L27:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T1, .L25
|
||||
|
||||
.L28:
|
||||
slli.d L, K, 3
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 1
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
andi T0, N, 1
|
||||
blt J, T0, .L20
|
||||
|
||||
.L30:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
LDARG $r25, $sp, 16
|
||||
LDARG $r26, $sp, 24
|
||||
LDARG $r27, $sp, 32
|
||||
LD $f23, $sp, 40
|
||||
LD $f24, $sp, 48
|
||||
LD $f25, $sp, 56
|
||||
LD $f26, $sp, 64
|
||||
LD $f27, $sp, 72
|
||||
LD $f28, $sp, 80
|
||||
LD $f29, $sp, 88
|
||||
LD $f30, $sp, 96
|
||||
LD $f31, $sp, 104
|
||||
|
||||
addi.d $sp, $sp, 128
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,193 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters (incoming argument registers) */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9 // outer loop counter (N >> 1, later N & 1)
|
||||
#define J $r10 // inner loop counter (M >> 2, later M & 3)
|
||||
#define S1 $r12 // read cursor, starts at TS
|
||||
#define S2 $r13 // read cursor, starts at TS + TL
|
||||
#define S3 $r14 // S3..S8 are not referenced by the routine in this file
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define TD $r20 // write cursor, starts at DST
|
||||
#define TS $r11 // source base for the current pass, starts at SRC
|
||||
#define TL $r7 // same register as LDA: the routine scales LDA in place to LDA << 3
|
||||
#define T0 $r23 // 2 * TL; $r23 is saved to and restored from the stack by the routine
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0 // F0..F3 carry the 4-byte loads/stores of the scalar tail loops
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4 // F4..F7 are not referenced by the routine in this file
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
|
||||
/* LASX vectors ($xr registers); only U0..U2 are referenced by the routine in this file */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define D0 $xr8
|
||||
#define D1 $xr9
|
||||
#define D2 $xr10
|
||||
#define D3 $xr11
|
||||
#define D4 $xr12
|
||||
#define D5 $xr13
|
||||
#define D6 $xr14
|
||||
#define D7 $xr15
|
||||
#define D8 $xr16
|
||||
|
||||
/*
 * Copy routine, LASX variant.
 *
 * Walks the source two streams at a time (S1 and S2, TL = LDA * 8 bytes
 * apart) and writes them to the contiguous destination TD. The scalar tail
 * stores 8 bytes from S1 followed by 8 bytes from S2; the vector loop is
 * expected to produce the same interleaved layout four elements at a time.
 * A final odd stream (N & 1) is copied straight through.
 *
 *   M   - elements per stream (4 per vector iteration, then M & 3 singly)
 *   N   - number of streams (2 per outer iteration, then the odd one)
 *   SRC - source base          LDA - source stride in 8-byte elements
 *   DST - destination base
 *
 * NOTE(review): each element is handled as two 4-byte floats (see the fld.s
 * pairs), presumably the real/imaginary parts of a single-precision complex
 * value -- confirm against the build rules that select this file.
 * PROLOGUE/EPILOGUE/SDARG/LDARG are macros not defined in this file
 * (presumably supplied by common.h).
 */
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -8 // NOTE(review): leaves sp only 8-byte aligned; the LoongArch psABI asks for 16 -- confirm this is acceptable for a leaf routine
|
||||
SDARG $r23, $sp, 0 // preserve $r23, which is used below as T0
|
||||
|
||||
move TD, DST //boffset
|
||||
move TS, SRC //aoffset
|
||||
|
||||
slli.d TL, LDA, 0x02 // TL = LDA << 2 (TL and LDA are the same register)
|
||||
slli.d TL, TL, 0x01 // TL = LDA << 3, i.e. LDA * 8
|
||||
slli.d T0, TL, 0x01 // T0 = 2 * TL, the advance applied to TS per outer pass
|
||||
|
||||
srai.d I, N, 0x01 // I = N / 2 passes over stream pairs
|
||||
beq I, ZERO, .L_N0
|
||||
|
||||
.L_J1: /* if (i > 0) I-- */
|
||||
move S1, TS //a_offset1
|
||||
add.d S2, TS, TL //a_offset2
|
||||
srai.d J, M, 0x02 // J = M / 4 vector iterations
|
||||
add.d TS, TS, T0 // step TS to the next pair for the following pass
|
||||
|
||||
beq J, ZERO, .L_I3
|
||||
|
||||
.L_I1: /* if (j > 0) J-- */
|
||||
xvld U0, S1, 0x00 // 32 bytes from S1
|
||||
xvld U1, S1, 0x00 // same 32 bytes again; U0 is overwritten before its upper half is needed
|
||||
xvld U2, S2, 0x00 // 32 bytes from S2
|
||||
|
||||
xvpermi.q U0, U2, 0x02 // NOTE(review): expected to pair the low 128 bits of the S1 and S2 data in U0 -- confirm against the LASX manual
|
||||
xvpermi.q U2, U1, 0x31 // NOTE(review): expected to pair the high 128 bits of the S1 and S2 data in U2 -- confirm
|
||||
|
||||
xvpermi.d U0, U0, 0xd8 // NOTE(review): 0xd8 selects 64-bit lanes 0,2,1,3, which should alternate S1/S2 elements -- confirm
|
||||
xvpermi.d U2, U2, 0xd8
|
||||
|
||||
xvst U0, TD, 0x00
|
||||
xvst U2, TD, 0x20
|
||||
|
||||
addi.d S1, S1, 0x20 // a_offset1
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d TD, TD, 0x40 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_I1
|
||||
|
||||
.L_I3:
|
||||
andi J, M, 0x03 // leftover elements per stream
|
||||
beq J, ZERO, .L_II20
|
||||
|
||||
.L_II1: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, S1, 0x00 // one 8-byte element from S1 as two floats
|
||||
fld.s F1, S1, 0x04
|
||||
fld.s F2, S2, 0x00 // one 8-byte element from S2 as two floats
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00 // stored S1 element first, S2 element second
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0c
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d TD, TD, 0x10
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_II1
|
||||
|
||||
.L_II20:
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_J1
|
||||
|
||||
.L_N0: /* if(n&1)*/
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N00
|
||||
|
||||
.L_N1:
|
||||
srai.d J, M, 0x02 // J = M / 4 vector iterations over the single remaining stream
|
||||
beq ZERO, J, .L_N10
|
||||
|
||||
.L_N11: /* j = (m >> 2) if (j > 0) */
|
||||
xvld U0, TS, 0x00 // straight 32-byte copy, no permutation
|
||||
|
||||
xvst U0, TD, 0x00
|
||||
|
||||
addi.d TS, TS, 0x20 // a_offset
|
||||
addi.d TD, TD, 0x20 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N11
|
||||
|
||||
.L_N10:
|
||||
andi J, M, 0x03 // leftover elements of the single stream
|
||||
beq J, ZERO, .L_N00
|
||||
|
||||
.L_N12: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, TS, 0x00
|
||||
fld.s F1, TS, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
|
||||
addi.d TS, TS, 0x08 // a_offset
|
||||
addi.d TD, TD, 0x08 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N12
|
||||
|
||||
.L_N00:
|
||||
LDARG $r23, $sp, 0 // restore the register used as T0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00 // return: jump to the address in $r1, link discarded into $r0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters (incoming argument registers) */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9 // outer loop counter (N >> 1, later N & 1)
|
||||
#define J $r10 // inner loop counter (M >> 2, later M & 3)
|
||||
#define S1 $r12 // read cursor, starts at TS
|
||||
#define S2 $r13 // read cursor, starts at TS + TL
|
||||
#define S3 $r14 // S3..S8 are not referenced by the routine in this file
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define TD $r20 // write cursor, starts at DST
|
||||
#define TS $r11 // source base for the current pass, starts at SRC
|
||||
#define TL $r7 // same register as LDA: the routine scales LDA in place to LDA << 3
|
||||
#define T0 $r23 // 2 * TL; $r23 is saved to and restored from the stack by the routine
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0 // F0..F3 carry the 4-byte loads/stores of the scalar tail loops
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4 // F4..F7 are not referenced by the routine in this file
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
|
||||
/* LSX vectors ($vr registers); only U0..U3 and D0..D3 are referenced by the routine in this file */
|
||||
#define U0 $vr0
|
||||
#define U1 $vr1
|
||||
#define U2 $vr2
|
||||
#define U3 $vr3
|
||||
#define U4 $vr4
|
||||
#define U5 $vr5
|
||||
#define U6 $vr6
|
||||
#define U7 $vr7
|
||||
#define D0 $vr8
|
||||
#define D1 $vr9
|
||||
#define D2 $vr10
|
||||
#define D3 $vr11
|
||||
#define D4 $vr12
|
||||
#define D5 $vr13
|
||||
#define D6 $vr14
|
||||
#define D7 $vr15
|
||||
#define D8 $vr16
|
||||
|
||||
/*
 * Copy routine, LSX variant (128-bit $vr registers).
 *
 * Walks the source two streams at a time (S1 and S2, TL = LDA * 8 bytes
 * apart) and writes them to the contiguous destination TD. The scalar tail
 * stores 8 bytes from S1 followed by 8 bytes from S2; the vector loop is
 * expected to produce the same interleaved layout four elements at a time.
 * A final odd stream (N & 1) is copied straight through.
 *
 *   M   - elements per stream (4 per vector iteration, then M & 3 singly)
 *   N   - number of streams (2 per outer iteration, then the odd one)
 *   SRC - source base          LDA - source stride in 8-byte elements
 *   DST - destination base
 *
 * NOTE(review): each element is handled as two 4-byte floats (see the fld.s
 * pairs), presumably the real/imaginary parts of a single-precision complex
 * value -- confirm against the build rules that select this file.
 * PROLOGUE/EPILOGUE/SDARG/LDARG are macros not defined in this file
 * (presumably supplied by common.h).
 */
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -8 // NOTE(review): leaves sp only 8-byte aligned; the LoongArch psABI asks for 16 -- confirm this is acceptable for a leaf routine
|
||||
SDARG $r23, $sp, 0 // preserve $r23, which is used below as T0
|
||||
|
||||
move TD, DST //boffset
|
||||
move TS, SRC //aoffset
|
||||
|
||||
slli.d TL, LDA, 0x02 // TL = LDA << 2 (TL and LDA are the same register)
|
||||
slli.d TL, TL, 0x01 // TL = LDA << 3, i.e. LDA * 8
|
||||
slli.d T0, TL, 0x01 // T0 = 2 * TL, the advance applied to TS per outer pass
|
||||
|
||||
srai.d I, N, 0x01 // I = N / 2 passes over stream pairs
|
||||
beq I, ZERO, .L_N0
|
||||
|
||||
.L_J1: /* if (i > 0) I-- */
|
||||
move S1, TS //a_offset1
|
||||
add.d S2, TS, TL //a_offset2
|
||||
srai.d J, M, 0x02 // J = M / 4 vector iterations
|
||||
add.d TS, TS, T0 // step TS to the next pair for the following pass
|
||||
|
||||
beq J, ZERO, .L_I3
|
||||
|
||||
.L_I1: /* if (j > 0) J-- */
|
||||
vld U0, S1, 0x00 // S1 bytes 0..15
|
||||
vld U1, S1, 0x10 // S1 bytes 16..31
|
||||
vld U2, S2, 0x00 // S2 bytes 0..15
|
||||
vld U3, S2, 0x10 // S2 bytes 16..31
|
||||
|
||||
vand.v D0, U2, U2 // x AND x = x: plain copies of the S2 data into D0..D3
|
||||
vand.v D1, U3, U3
|
||||
vand.v D2, U2, U2
|
||||
vand.v D3, U3, U3
|
||||
|
||||
vpermi.w D0, U0, 0x44 // NOTE(review): expected to pair the low 8 bytes of U0 with the low 8 bytes of D0 -- confirm against the LSX manual
|
||||
vpermi.w D2, U0, 0xee // NOTE(review): expected to pair the high 8 bytes of U0 with the high 8 bytes of D2 -- confirm
|
||||
vpermi.w D1, U1, 0x44
|
||||
vpermi.w D3, U1, 0xee
|
||||
|
||||
vst D0, TD, 0x00 // note the store order D0, D2, D1, D3
|
||||
vst D2, TD, 0x10
|
||||
vst D1, TD, 0x20
|
||||
vst D3, TD, 0x30
|
||||
|
||||
addi.d S1, S1, 0x20 // a_offset1
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d TD, TD, 0x40 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_I1
|
||||
|
||||
.L_I3:
|
||||
andi J, M, 0x03 // leftover elements per stream
|
||||
beq J, ZERO, .L_II20
|
||||
|
||||
.L_II1: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, S1, 0x00 // one 8-byte element from S1 as two floats
|
||||
fld.s F1, S1, 0x04
|
||||
fld.s F2, S2, 0x00 // one 8-byte element from S2 as two floats
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00 // stored S1 element first, S2 element second
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0c
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d TD, TD, 0x10
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_II1
|
||||
|
||||
.L_II20:
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_J1
|
||||
|
||||
.L_N0: /* if(n&1)*/
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N00
|
||||
|
||||
.L_N1:
|
||||
srai.d J, M, 0x02 // J = M / 4 vector iterations over the single remaining stream
|
||||
beq ZERO, J, .L_N10
|
||||
|
||||
.L_N11: /* j = (m >> 2) if (j > 0) */
|
||||
vld U0, TS, 0x00 // straight 32-byte copy in two 16-byte halves, no permutation
|
||||
vld U1, TS, 0x10
|
||||
|
||||
vst U0, TD, 0x00
|
||||
vst U1, TD, 0x10
|
||||
|
||||
addi.d TS, TS, 0x20 // a_offset
|
||||
addi.d TD, TD, 0x20 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N11
|
||||
|
||||
.L_N10:
|
||||
andi J, M, 0x03 // leftover elements of the single stream
|
||||
beq J, ZERO, .L_N00
|
||||
|
||||
.L_N12: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, TS, 0x00
|
||||
fld.s F1, TS, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
|
||||
addi.d TS, TS, 0x08 // a_offset
|
||||
addi.d TD, TD, 0x08 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N12
|
||||
|
||||
.L_N00:
|
||||
LDARG $r23, $sp, 0 // restore the register used as T0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00 // return: jump to the address in $r1, link discarded into $r0
|
||||
|
||||
EPILOGUE
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue