Merge pull request #4408 from OpenMathLib/develop

merge develop for 0.3.26 release

commit 8fe7f80271
@@ -29,7 +29,7 @@ task:
 - mkdir build
 - cd build
 - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
-- make
+- make -j 4

 task:
 name: AppleM1/GCC/MAKE/OPENMP
@@ -16,13 +16,13 @@ jobs:
 include:
 - target: LOONGSONGENERIC
 triple: loongarch64-unknown-linux-gnu
-opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
+opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
 - target: LOONGSON3R5
 triple: loongarch64-unknown-linux-gnu
-opts: NO_SHARED=1 TARGET=LOONGSON3R5
+opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
 - target: LOONGSON2K1000
 triple: loongarch64-unknown-linux-gnu
-opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
 - target: DYNAMIC_ARCH
 triple: loongarch64-unknown-linux-gnu
 opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
@@ -40,8 +40,9 @@ jobs:

 - name: Download and install loongarch64-toolchain
 run: |
-wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
-tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
+wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+#wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt

 - name: Set env
 run: |
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 25)
+set(OpenBLAS_PATCH_VERSION 25.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@@ -249,20 +249,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
 endif()
 endif()

-if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
+# Seems that this hack doesn't required since macOS 11 Big Sur
+if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
 set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
 if (NOT NOFORTRAN)
 set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
 set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
-"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
 "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
 "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
 "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
 else ()
 set (CMAKE_C_CREATE_SHARED_LIBRARY
-"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
 "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
 endif ()
 endif()
@@ -541,7 +542,7 @@ if(NOT NO_LAPACKE)
 ADD_CUSTOM_TARGET(genlapacke
 COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
 )
-install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
+install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()

 # Install pkg-config files
@@ -216,3 +216,6 @@ In chronological order:

 * Pablo Romero <https://github.com/pablorcum>
 * [2022-08] Fix building from sources for QNX
+
+* Mark Seminatore <https://github.com/mseminatore>
+* [2023-11-09] Improve Windows threading performance scaling
@@ -1,4 +1,49 @@
 OpenBLAS ChangeLog
 ====================================================================
+Version 0.3.26
+2-Jan-2024
+
+general:
+- improved the version of openblas.pc that is created by the CMAKE build
+- fixed a CMAKE-specific build problem on older versions of MacOS
+- worked around linking problems on old versions of MacOS
+- corrected installation location of the lapacke_mangling header in CMAKE builds
+- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
+- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
+- imported additions and corrections from the Reference-LAPACK project:
+  - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
+  - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
+  - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
+  - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
+  - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
+  - fixed several memory leaks in the LAPACK testsuite (PR 953)
+  - fixed counting of testsuite results by the Python script (PR 954)
+
+x86-64:
+- fixed computation of CASUM on SkylakeX and newer targets in the special
+  case that AVX512 is not supported by the compiler or operating environment
+- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
+- worked around a problem in the pre-AVX kernels for GEMV
+- sped up the thread management code on MS Windows
+
+arm64:
+- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
+- sped up the thread management code on MS Windows
+- sped up SGEMM and DGEMM on Neoverse V1 and N1
+- sped up ?DOT on SVE-capable targets
+- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
+- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
+
+power:
+- improved the SGEMM kernel for POWER10
+- fixed compilation with (very) old versions of gcc
+- fixed detection of old 32bit PPC targets in CMAKE-based builds
+- added autodetection of the POWERPC 7400 subtype
+- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
+
+loongarch64:
+- added and improved optimized kernels for almost all BLAS functions
+
+====================================================================
 Version 0.3.25
 12-Nov-2023
@@ -11,7 +11,7 @@
 operation is finished.


-2. Simlar problem may happen under virtual machine. If supervisor
+2. Similar problem may happen under virtual machine. If supervisor
 allocates different cores for each scheduling, BLAS performnace
 will be bad. This is because BLAS also utilizes all cache,
 unexpected re-schedule for different core may result of heavy
@@ -11,7 +11,19 @@ endif

 ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
+ifeq ($(C_COMPILER), GCC)
+ifeq ($(GCCVERSIONGTEQ10), 1)
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+else ifneq ($(GCCVERSIONGT4), 1)
+$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+else
+$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+endif
+else
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+endif
 ifeq ($(F_COMPILER), IBM)
 FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
 else
@@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.25
+VERSION = 0.3.25.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -407,6 +407,7 @@ XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/vers
 endif
 ifeq (x$(XCVER), x 15)
 CCOMMON_OPT += -Wl,-ld_classic
+FCOMMON_OPT += -Wl,-ld_classic
 endif
 endif

@@ -676,16 +677,12 @@ ifeq ($(ARCH), arm64)
 DYNAMIC_CORE = ARMV8
 DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
-DYNAMIC_CORE += CORTEXA72
-DYNAMIC_CORE += CORTEXA73
 DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
 DYNAMIC_CORE += ARMV8SVE
 endif
-DYNAMIC_CORE += CORTEXA55
-DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110

README.md (10 changes)
@@ -196,20 +196,22 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 ```sh
 make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
 ```
-(also known to work on C906)
+(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)

 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.

-For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
+For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.

 `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
 Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.

-On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.

-For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
+
+on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.

 The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.
@@ -288,9 +288,9 @@ jobs:
 vmImage: 'ubuntu-latest'
 steps:
 - script: |
-wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
-&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \
-|| exit 1
+wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
+&& echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06 alpine-chroot-install' | sha1sum -c \
+|| exit 1
 alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
 sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
 alpine make DYNAMIC_ARCH=1 BINARY=64
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
 long long muls = n*(n+1)/2.0;
 long long adds = (n - 1.0)*n/2.0;

-fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
+fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
 if(a != NULL){
 free(a);
 }

c_check (6 changes)
@@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then
 tmpd="$(mktemp -d)"
 tmplsx="$tmpd/lsx.c"
 codelsx='"vadd.b $vr0, $vr0, $vr0"'
-lsx_flags='-march=loongarch64 -mlsx'
-printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
+lsx_flags='-march=loongarch64'
 printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
 args="$lsx_flags -o $tmplsx.o $tmplsx"
 {

@@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then

 tmplasx="$tmpd/lasx.c"
 codelasx='"xvadd.b $xr0, $xr0, $xr0"'
-lasx_flags='-march=loongarch64 -mlasx'
-printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
+lasx_flags='-march=loongarch64'
 printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
 args="$lasx_flags -o $tmplasx.o $tmplasx"
 {
@@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) {
 } else {
 $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
 $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
-$lsx_flags = "-march=loongarch64 -mlsx";
-print $tmplsx "#include <lsxintrin.h>\n\n";
+$lsx_flags = "-march=loongarch64";
 print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";

 $args = "$lsx_flags -o $tmplsx.o $tmplsx";

@@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) {

 $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
 $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
-$lasx_flags = "-march=loongarch64 -mlasx";
-print $tmplasx "#include <lasxintrin.h>\n\n";
+$lasx_flags = "-march=loongarch64";
 print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";

 $args = "$lasx_flags -o $tmplasx.o $tmplasx";
@@ -44,7 +44,7 @@ endif ()

 if (DYNAMIC_ARCH)
 if (ARM64)
-set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
 if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
 set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
 endif ()
@@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS

 if (LOONGARCH64)
 if (BINARY64)
-set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+if(COMPILER_SUPPORT_LP64D_ABI)
+set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
+else()
+set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+endif ()
 else ()
-set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+if(COMPILER_SUPPORT_ILP32D_ABI)
+set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
+else()
+set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+endif ()
 endif ()
 set(BINARY_DEFINED 1)
 endif ()

@@ -282,6 +292,27 @@ if (${CORE} STREQUAL POWER8)
 endif ()
 endif ()

+# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone
+# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case.
+if (${CORE} STREQUAL PPC970)
+if (NOT DYNAMIC_ARCH)
+set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
+endif ()
+if (APPLE)
+set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
+endif ()
+endif ()
+
+# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
+if (${CORE} STREQUAL PPCG4)
+if (NOT DYNAMIC_ARCH)
+set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
+endif ()
+if (APPLE)
+set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
+endif ()
+endif ()
+
 if (NOT DYNAMIC_ARCH)
 if (HAVE_AVX2)
 set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
@@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
 endif ()
 if (LOONGARCH64)
 if (BINARY64)
-set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+if(COMPILER_SUPPORT_LP64D_ABI)
+set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
+else()
+set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+endif ()
 else ()
-set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+if(COMPILER_SUPPORT_ILP32D_ABI)
+set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
+else()
+set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+endif ()
 endif ()
 endif ()
 if (RISCV64)
@@ -52,7 +52,7 @@ set(SLASRC
 sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
 sgehd2.f sgehrd.f sgelq2.f sgelqf.f
 sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f
-sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
+sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
 sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f
 sgetrf2.f sgetri.f
 sggbak.f sggbal.f

@@ -67,7 +67,7 @@ set(SLASRC
 slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
 slansy.f slantb.f slantp.f slantr.f slanv2.f
 slapll.f slapmt.f
-slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
+slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
 slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
 slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
 slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f

@@ -139,7 +139,7 @@ set(CLASRC
 cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
 cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
 cgehd2.f cgehrd.f cgelq2.f cgelqf.f
-cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
+cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f
 cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f
 cgesc2.f cgesdd.f cgesvd.f cgesvdx.f
 cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f

@@ -173,7 +173,7 @@ set(CLASRC
 clanhb.f clanhe.f
 clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
 clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
-claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
+claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
 claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
 claqz0.f claqz1.f claqz2.f claqz3.f
 claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f

@@ -243,7 +243,7 @@ set(DLASRC
 dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
 dgehd2.f dgehrd.f dgelq2.f dgelqf.f
 dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f
-dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
+dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
 dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f
 dgetrf2.f dgetri.f
 dggbak.f dggbal.f

@@ -258,7 +258,7 @@ set(DLASRC
 dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
 dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
 dlapll.f dlapmt.f
-dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
+dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
 dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
 dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
 dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f

@@ -331,7 +331,7 @@ set(ZLASRC
 zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
 zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
 zgehd2.f zgehrd.f zgelq2.f zgelqf.f
-zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
+zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f
 zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
 zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f
 zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f

@@ -367,7 +367,7 @@ set(ZLASRC
 zlanhe.f
 zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
 zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f
-zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
+zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f
 zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
 zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
 zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f

@@ -557,7 +557,7 @@ set(SLASRC
 sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c
 sgehd2.c sgehrd.c sgelq2.c sgelqf.c
 sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
-sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
+sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
 sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
 sgetrf2.c sgetri.c
 sggbak.c sggbal.c

@@ -571,7 +571,7 @@ set(SLASRC
 slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c
 slansy.c slantb.c slantp.c slantr.c slanv2.c
 slapll.c slapmt.c
-slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
+slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
 slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
 slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
 slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c

@@ -643,7 +643,7 @@ set(CLASRC
 cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
 cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c
 cgehd2.c cgehrd.c cgelq2.c cgelqf.c
-cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
+cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c
 cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
 cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
 cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c

@@ -677,7 +677,7 @@ set(CLASRC
 clanhb.c clanhe.c
 clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c
 clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c
-claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
+claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c
 claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
 claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
 clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c

@@ -746,7 +746,7 @@ set(DLASRC
 dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c
 dgehd2.c dgehrd.c dgelq2.c dgelqf.c
 dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
-dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
+dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
 dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
 dgetrf2.c dgetri.c
 dggbak.c dggbal.c

@@ -760,7 +760,7 @@ set(DLASRC
 dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c
 dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
 dlapll.c dlapmt.c
-dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
+dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
 dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
 dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
 dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c

@@ -833,7 +833,7 @@ set(ZLASRC
 zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
 zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c
 zgehd2.c zgehrd.c zgelq2.c zgelqf.c
-zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
+zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c
 zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
 zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
 zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c

@@ -868,7 +868,7 @@ set(ZLASRC
 zlanhe.c
 zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c
 zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c
-zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
+zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
 zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
 zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
 zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
@@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
 Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
-Version: @OPENBLAS_VERSION@
-URL: https://github.com/xianyi/OpenBLAS
+Version: @OpenBLAS_VERSION@
+URL: https://github.com/OpenMathLib/OpenBLAS
 Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
 Cflags: -I${includedir}
@@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64)
 endif()
 elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
 set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
 set(POWER 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
 set(MIPS64 1)

@@ -46,7 +46,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
 set(LOONGARCH64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
 set(RISCV64 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*"))
 if (NOT BINARY)
 if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
 set(X86_64 1)

@@ -109,7 +109,7 @@ else()
 endif ()

 if (NOT BINARY)
-if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
+if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
 set(BINARY 64)
 else ()
 set(BINARY 32)
@@ -119,12 +119,50 @@ static inline int WhereAmI(void){
 #define MOV fmov.d
 #define CMOVT fsel
 #define MTC movgr2fr.d
+#define MTG movfr2gr.d
 #define FABS fabs.d
+#define FMIN fmin.d
+#define FMINA fmina.d
+#define FMAX fmax.d
+#define FMAXA fmaxa.d
 #define CMPEQ fcmp.ceq.d
 #define CMPLE fcmp.cle.d
 #define CMPLT fcmp.clt.d
 #define NEG fneg.d
+#define FFINT ffint.d.l
+
+#define XVFSUB xvfsub.d
+#define XVFADD xvfadd.d
+#define XVFMUL xvfmul.d
+#define XVFMADD xvfmadd.d
+#define XVFMIN xvfmin.d
+#define XVFMINA xvfmina.d
+#define XVFMAX xvfmax.d
+#define XVFMAXA xvfmaxa.d
+#define XVCMPEQ xvfcmp.ceq.d
+#define XVCMPLE xvfcmp.cle.d
+#define XVCMPLT xvfcmp.clt.d
+#define XVMUL xvfmul.d
+#define XVMSUB xvfmsub.d
+#define XVNMSUB xvfnmsub.d
+
+#define VFSUB vfsub.d
+#define VFADD vfadd.d
+#define VFMUL vfmul.d
+#define VFMADD vfmadd.d
+#define VFMIN vfmin.d
+#define VFMINA vfmina.d
+#define VFMAX vfmax.d
+#define VFMAXA vfmaxa.d
+#define VCMPEQ vfcmp.ceq.d
+#define VCMPLE vfcmp.cle.d
+#define VCMPLT vfcmp.clt.d
+#define VMUL vfmul.d
+#define VMSUB vfmsub.d
+#define VNMSUB vfnmsub.d
+
 #else

 #define LD fld.s
 #define ST fst.s
 #define MADD fmadd.s

@@ -137,11 +175,48 @@ static inline int WhereAmI(void){
 #define MOV fmov.s
 #define CMOVT fsel
 #define MTC movgr2fr.w
+#define MTG movfr2gr.s
 #define FABS fabs.s
+#define FMIN fmin.s
+#define FMINA fmina.s
+#define FMAX fmax.s
+#define FMAXA fmaxa.s
 #define CMPEQ fcmp.ceq.s
 #define CMPLE fcmp.cle.s
 #define CMPLT fcmp.clt.s
 #define NEG fneg.s
+#define FFINT ffint.s.l
+
+#define XVFSUB xvfsub.s
+#define XVFADD xvfadd.s
+#define XVFMUL xvfmul.s
+#define XVFMADD xvfmadd.s
+#define XVFMIN xvfmin.s
+#define XVFMINA xvfmina.s
+#define XVFMAX xvfmax.s
+#define XVFMAXA xvfmaxa.s
+#define XVCMPEQ xvfcmp.ceq.s
+#define XVCMPLE xvfcmp.cle.s
+#define XVCMPLT xvfcmp.clt.s
+#define XVMUL xvfmul.s
+#define XVMSUB xvfmsub.s
+#define XVNMSUB xvfnmsub.s
+
+#define VFSUB vfsub.s
+#define VFADD vfadd.s
+#define VFMUL vfmul.s
+#define VFMADD vfmadd.s
+#define VFMIN vfmin.s
+#define VFMINA vfmina.s
+#define VFMAX vfmax.s
+#define VFMAXA vfmaxa.s
+#define VCMPEQ vfcmp.ceq.s
+#define VCMPLE vfcmp.cle.s
+#define VCMPLT vfcmp.clt.s
+#define VMUL vfmul.s
+#define VMSUB vfmsub.s
+#define VNMSUB vfnmsub.s
+
 #endif /* defined(DOUBLE) */

 #if defined(__64BIT__) && defined(USE64BITINT)
@@ -111,8 +111,9 @@ typedef struct blas_queue {
 struct blas_queue *next;

 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
-CRITICAL_SECTION lock;
-HANDLE finish;
+// CRITICAL_SECTION lock;
+// HANDLE finish;
+volatile int finished;
 #else
 pthread_mutex_t lock;
 pthread_cond_t finished;
@@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3R5 1
 #define CPU_LOONGSON2K1000 2

-#define LA_HWCAP_LSX (1<<4)
-#define LA_HWCAP_LASX (1<<5)
+#define LA_HWCAP_LSX (1U << 4)
+#define LA_HWCAP_LASX (1U << 5)

 static char *cpuname[] = {
 "LOONGSONGENERIC",

@@ -64,11 +64,11 @@ static char *cpuname_lower[] = {

 int detect(void) {
 #ifdef __linux
-int flag = (int)getauxval(AT_HWCAP);
+int hwcap = (int)getauxval(AT_HWCAP);

-if (flag & LA_HWCAP_LASX)
+if (hwcap & LA_HWCAP_LASX)
 return CPU_LOONGSON3R5;
-else if (flag & LA_HWCAP_LSX)
+else if (hwcap & LA_HWCAP_LSX)
 return CPU_LOONGSON2K1000;
 else
 return CPU_GENERIC;

@@ -94,7 +94,9 @@ void get_subdirname(void) {
 }

 void get_cpuconfig(void) {
+uint32_t hwcaps = 0;
 int d = detect();
+
 switch (d) {
 case CPU_LOONGSON3R5:
 printf("#define LOONGSON3R5\n");

@@ -129,6 +131,10 @@ void get_cpuconfig(void) {
 printf("#define L2_ASSOCIATIVE 16\n");
 break;
 }
+
+hwcaps = (uint32_t)getauxval( AT_HWCAP );
+if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n");
+if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n");
 }

 void get_libname(void){
@@ -160,6 +160,7 @@ int detect(void){
 infoCount = HOST_BASIC_INFO_COUNT;
 host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);

+if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
 if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
 if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970;
@@ -51,15 +51,10 @@
 /* This is a thread implementation for Win32 lazy implementation */

-/* Thread server common information */
-typedef struct{
-CRITICAL_SECTION lock;
-HANDLE filled;
-HANDLE killed;

-blas_queue_t *queue; /* Parameter Pointer */
-int shutdown; /* server shutdown flag */

-} blas_pool_t;
+static blas_queue_t *work_queue = NULL;
+static HANDLE kickoff_event = NULL;
+static CRITICAL_SECTION queue_lock;

 /* We need this global for checking if initialization is finished. */
 int blas_server_avail = 0;

@@ -67,11 +62,19 @@ int blas_server_avail = 0;
 /* Local Variables */
 static BLASULONG server_lock = 0;

-static blas_pool_t pool;
 static HANDLE blas_threads [MAX_CPU_NUMBER];
 static DWORD blas_threads_id[MAX_CPU_NUMBER];
+static volatile int thread_target; // target num of live threads, volatile for cross-thread reads


+#if defined (__GNUC__) && (__GNUC__ < 6)
+#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
+#else
+#if defined(_WIN64)
+#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
+#else
+#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
+#endif
+#endif
+
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
@@ -202,14 +205,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 static DWORD WINAPI blas_thread_server(void *arg){

 /* Thread identifier */
-#ifdef SMP_DEBUG
 BLASLONG cpu = (BLASLONG)arg;
-#endif

 void *buffer, *sa, *sb;
 blas_queue_t *queue;
-DWORD action;
-HANDLE handles[] = {pool.filled, pool.killed};

 /* Each server needs each buffer */
 buffer = blas_memory_alloc(2);
@@ -225,29 +224,44 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
 fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
 #endif
+// event raised when work is added to the queue
+WaitForSingleObject(kickoff_event, INFINITE);

-do {
-action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
-} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));

-if (action == WAIT_OBJECT_0 + 1) break;
+if (cpu > thread_target - 2)
+{
+//printf("thread [%d] exiting.\n", cpu);
+break; // excess thread, so worker thread exits
+}

 #ifdef SMP_DEBUG
 fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
 #endif

-EnterCriticalSection(&pool.lock);
+#if 1
+EnterCriticalSection(&queue_lock);

-queue = pool.queue;
-if (queue) pool.queue = queue->next;
+queue = work_queue;
+if (queue)
+work_queue = work_queue->next;

-LeaveCriticalSection(&pool.lock);
+LeaveCriticalSection(&queue_lock);
+#else
+volatile blas_queue_t* queue_next;
+
+INT_PTR prev_value;
+do {
+queue = (volatile blas_queue_t*)work_queue;
+if (!queue)
+break;
+
+queue_next = (volatile blas_queue_t*)queue->next;
+prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
+} while (prev_value != queue);
+#endif

 if (queue) {
 int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;

-if (pool.queue) SetEvent(pool.filled);

 sa = queue -> sa;
 sb = queue -> sb;
@@ -331,14 +345,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
 #ifdef SMP_DEBUG
 fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
 #endif

+queue->finished = 1;
+
-EnterCriticalSection(&queue->lock);
-
-queue -> status = BLAS_STATUS_FINISHED;
-
-LeaveCriticalSection(&queue->lock);
-
-SetEvent(queue->finish);
 }

 /* Shutdown procedure */
@@ -366,15 +375,16 @@ int blas_thread_init(void){
 #endif

 if (!blas_server_avail){
+// create the kickoff Event
+kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);

-InitializeCriticalSection(&pool.lock);
-pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
+thread_target = blas_cpu_number;

-pool.shutdown = 0;
-pool.queue = NULL;
+InitializeCriticalSection(&queue_lock);

 for(i = 0; i < blas_cpu_number - 1; i++){
 //printf("thread_init: creating thread [%d]\n", i);

 blas_threads[i] = CreateThread(NULL, 0,
 blas_thread_server, (void *)i,
 0, &blas_threads_id[i]);
@@ -409,8 +419,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 current = queue;

 while (current) {
-InitializeCriticalSection(&current -> lock);
-current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
 current -> position = pos;

 #ifdef CONSISTENT_FPCSR
@@ -418,23 +426,32 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
 __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
 #endif

+current->finished = 0;
 current = current -> next;
 pos ++;
 }

-EnterCriticalSection(&pool.lock);
+EnterCriticalSection(&queue_lock);

-if (pool.queue) {
-current = pool.queue;
-while (current -> next) current = current -> next;
-current -> next = queue;
-} else {
-pool.queue = queue;
+if (!work_queue)
+{
+work_queue = queue;
+}
+else
+{
+blas_queue_t *next_item = work_queue;
+
+// find the end of the work queue
+while (next_item)
+next_item = next_item->next;
+
+// add new work to the end
+next_item = queue;
 }

-LeaveCriticalSection(&pool.lock);
+LeaveCriticalSection(&queue_lock);

-SetEvent(pool.filled);
+SetEvent(kickoff_event);

 return 0;
 }
@@ -449,21 +466,26 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
 #ifdef SMP_DEBUG
 fprintf(STDERR, "Waiting Queue ..\n");
 #endif
+while (!queue->finished)
+YIELDING;

-WaitForSingleObject(queue->finish, INFINITE);
-
-CloseHandle(queue->finish);
-DeleteCriticalSection(&queue -> lock);
-
-queue = queue -> next;
-num --;
+queue = queue->next;
+num--;
 }

 #ifdef SMP_DEBUG
 fprintf(STDERR, "Completely Done.\n\n");
 #endif
+// if work was added to the queue after this batch we can't sleep the worker threads
+// by resetting the event
+EnterCriticalSection(&queue_lock);

-return 0;
+if (work_queue == NULL)
+ResetEvent(kickoff_event);
+
+LeaveCriticalSection(&queue_lock);
+
+return 0;
 }
@@ -512,8 +534,6 @@ int BLASFUNC(blas_thread_shutdown)(void){

 if (blas_server_avail){

-SetEvent(pool.killed);
-
 for(i = 0; i < blas_num_threads - 1; i++){
 // Could also just use WaitForMultipleObjects
 DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);

@@ -528,9 +548,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
 CloseHandle(blas_threads[i]);
 }

-CloseHandle(pool.filled);
-CloseHandle(pool.killed);
-
 blas_server_avail = 0;
 }
@@ -552,23 +569,48 @@ void goto_set_num_threads(int num_threads)

 if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;

+if (blas_server_avail && num_threads < blas_num_threads) {
+LOCK_COMMAND(&server_lock);
+
+thread_target = num_threads;
+
+SetEvent(kickoff_event);
+
+for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
+//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
+
+WaitForSingleObject(blas_threads[i], INFINITE);
+
+//printf("set_num_threads: thread [%d] has quit.\n", i);
+
+CloseHandle(blas_threads[i]);
+}
+
+blas_num_threads = num_threads;
+
+ResetEvent(kickoff_event);
+
+UNLOCK_COMMAND(&server_lock);
+}
+
 if (num_threads > blas_num_threads) {

 LOCK_COMMAND(&server_lock);

+thread_target = num_threads;
+
 //increased_threads = 1;
 if (!blas_server_avail){
+// create the kickoff Event
+kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
+
-InitializeCriticalSection(&pool.lock);
-pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
+InitializeCriticalSection(&queue_lock);

-pool.shutdown = 0;
-pool.queue = NULL;
 blas_server_avail = 1;
 }

 for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
 //printf("set_num_threads: creating thread [%d]\n", i);

 blas_threads[i] = CreateThread(NULL, 0,
 blas_thread_server, (void *)i,
@@ -122,10 +122,11 @@ extern gotoblas_t gotoblas_CORTEXA55;
 #endif
 #else
 extern gotoblas_t gotoblas_CORTEXA53;
+#define gotoblas_CORTEXA55 gotoblas_CORTEXA53
 extern gotoblas_t gotoblas_CORTEXA57;
-extern gotoblas_t gotoblas_CORTEXA72;
-extern gotoblas_t gotoblas_CORTEXA73;
-extern gotoblas_t gotoblas_FALKOR;
+#define gotoblas_CORTEXA72 gotoblas_CORTEXA57
+#define gotoblas_CORTEXA73 gotoblas_CORTEXA57
+#define gotoblas_FALKOR gotoblas_CORTEXA57
 extern gotoblas_t gotoblas_THUNDERX;
 extern gotoblas_t gotoblas_THUNDERX2T99;
 extern gotoblas_t gotoblas_TSV110;

@@ -141,7 +142,6 @@ extern gotoblas_t gotoblas_ARMV8SVE;
 #define gotoblas_ARMV8SVE gotoblas_ARMV8
 #endif
 extern gotoblas_t gotoblas_THUNDERX3T110;
-extern gotoblas_t gotoblas_CORTEXA55;
 #endif

 extern void openblas_warning(int verbose, const char * msg);
@@ -247,6 +247,10 @@ static gotoblas_t *get_coretype(void) {
 int implementer, variant, part, arch, revision, midr_el1;
 char coremsg[128];

+#if defined (OS_DARWIN)
+return &gotoblas_NEOVERSEN1;
+#endif
+
 #if (!defined OS_LINUX && !defined OS_ANDROID)
 return NULL;
 #else

@@ -352,6 +356,9 @@ static gotoblas_t *get_coretype(void) {
 return &gotoblas_FALKOR;
 }
 break;
+case 0x61: // Apple
+return &gotoblas_NEOVERSEN1;
+break;
 default:
 snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
 openblas_warning(1, coremsg);
@@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/

+#include <sys/auxv.h>
 #include "common.h"

 extern gotoblas_t gotoblas_LOONGSON3R5;

@@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) {
 return NULL;
 }

-#define LASX_MASK 1<<7
-#define LSX_MASK 1<<6
-#define LOONGARCH_CFG2 0x02
+#define LA_HWCAP_LSX (1U << 4)
+#define LA_HWCAP_LASX (1U << 5)

 static gotoblas_t *get_coretype(void) {
-int ret = 0;
-__asm__ volatile (
-"cpucfg %0, %1 \n\t"
-: "+&r"(ret)
-: "r"(LOONGARCH_CFG2)
-);
+int hwcap = (int)getauxval(AT_HWCAP);

-if (ret & LASX_MASK)
+if (hwcap & LA_HWCAP_LASX)
 return &gotoblas_LOONGSON3R5;
-else if (ret & LSX_MASK)
+else if (hwcap & LA_HWCAP_LSX)
 return &gotoblas_LOONGSON2K1000;
 else
 return &gotoblas_LOONGSONGENERIC;
@@ -66,8 +66,7 @@ static int cpuid(void)
 #endif
 return CPU_UNKNOWN;
 }
-#else
-#if defined(C_PGI) || defined(__clang__)
+#elif defined(C_PGI) || defined(__clang__)
 /*
 * NV HPC compilers do not yet implement __builtin_cpu_is().
 * Fake a version here for use in the CPU detection code below.

@@ -196,13 +195,21 @@ static int cpuid(void)
 cpu_type = pvrPOWER[i].cpu_type;
 return (int)(cpu_type);
 }
-#endif /* C_PGI */
+#elif !defined(__BUILTIN_CPU_SUPPORTS__)
+static int cpuid(void)
+{
+return CPU_UNKNOWN;
+}
+#endif /* _AIX */

 #ifndef __BUILTIN_CPU_SUPPORTS__
 #include <string.h>

-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_is))
+#ifndef __has_builtin
+#define __has_builtin(x) 0
+#endif
+
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_is)
 static int __builtin_cpu_is(const char *arg)
 {
 static int ipinfo = -1;

@@ -227,7 +234,7 @@ static int __builtin_cpu_is(const char *arg)
 }
 #endif

-#if defined(_AIX) || (defined(__has_builtin) && !__has_builtin(__builtin_cpu_supports))
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports)
 static int __builtin_cpu_supports(const char *arg)
 {
 return 0;
@@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,

 #ifdef SMP
 args.common = NULL;
-args.nthreads = num_cpu_avail(4);
+#ifndef DOUBLE
+if (args.m*args.n < 40000)
+#else
+if (args.m*args.n < 10000)
+#endif
+args.nthreads=1;
+else
+args.nthreads = num_cpu_avail(4);

 if (args.nthreads == 1) {
 #endif
@@ -1,206 +1 @@
-SAMINKERNEL = ../arm/amin.c
-DAMINKERNEL = ../arm/amin.c
-CAMINKERNEL = ../arm/zamin.c
-ZAMINKERNEL = ../arm/zamin.c
-
-SMAXKERNEL = ../arm/max.c
-DMAXKERNEL = ../arm/max.c
-
-SMINKERNEL = ../arm/min.c
-DMINKERNEL = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL = ../arm/imax.c
-IDMAXKERNEL = ../arm/imax.c
-
-ISMINKERNEL = ../arm/imin.c
-IDMINKERNEL = ../arm/imin.c
-
-STRSMKERNEL_LN = trsm_kernel_LN_sve.c
-STRSMKERNEL_LT = trsm_kernel_LT_sve.c
-STRSMKERNEL_RN = trsm_kernel_RN_sve.c
-STRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M = trsm_lncopy_sve.c
-TRSMCOPYLT_M = trsm_ltcopy_sve.c
-TRSMCOPYUN_M = trsm_uncopy_sve.c
-TRSMCOPYUT_M = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL = amax.S
-DAMAXKERNEL = amax.S
-CAMAXKERNEL = zamax.S
-ZAMAXKERNEL = zamax.S
-
-SAXPYKERNEL = axpy.S
-DAXPYKERNEL = daxpy_thunderx2t99.S
-CAXPYKERNEL = zaxpy.S
-ZAXPYKERNEL = zaxpy.S
-
-SROTKERNEL = rot.S
-DROTKERNEL = rot.S
-CROTKERNEL = zrot.S
-ZROTKERNEL = zrot.S
-
-SSCALKERNEL = scal.S
-DSCALKERNEL = scal.S
-CSCALKERNEL = zscal.S
-ZSCALKERNEL = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-SASUMKERNEL = sasum_thunderx2t99.c
-DASUMKERNEL = dasum_thunderx2t99.c
-CASUMKERNEL = casum_thunderx2t99.c
-ZASUMKERNEL = zasum_thunderx2t99.c
-
-SCOPYKERNEL = copy_thunderx2t99.c
-DCOPYKERNEL = copy_thunderx2t99.c
-CCOPYKERNEL = copy_thunderx2t99.c
-ZCOPYKERNEL = copy_thunderx2t99.c
-
-SSWAPKERNEL = swap_thunderx2t99.S
-DSWAPKERNEL = swap_thunderx2t99.S
-CSWAPKERNEL = swap_thunderx2t99.S
-ZSWAPKERNEL = swap_thunderx2t99.S
-
-ISAMAXKERNEL = iamax_thunderx2t99.c
-IDAMAXKERNEL = iamax_thunderx2t99.c
-ICAMAXKERNEL = izamax_thunderx2t99.c
-IZAMAXKERNEL = izamax_thunderx2t99.c
-
-SNRM2KERNEL = scnrm2_thunderx2t99.c
-DNRM2KERNEL = dznrm2_thunderx2t99.c
-CNRM2KERNEL = scnrm2_thunderx2t99.c
-ZNRM2KERNEL = dznrm2_thunderx2t99.c
-
-DDOTKERNEL = dot.c
-SDOTKERNEL = dot.c
-CDOTKERNEL = zdot_thunderx2t99.c
-ZDOTKERNEL = zdot_thunderx2t99.c
-DSDOTKERNEL = dot.S
-
-DGEMM_BETA = dgemm_beta.S
-SGEMM_BETA = sgemm_beta.S
-
-SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M = symm_ucopy_sve.c
-SSYMMLCOPY_M = symm_lcopy_sve.c
-
-DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M = symm_ucopy_sve.c
-DSYMMLCOPY_M = symm_lcopy_sve.c
-
-CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M = zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M = zsymm_ucopy_sve.c
-CSYMMLCOPY_M = zsymm_lcopy_sve.c
-
-ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M = zsymm_ucopy_sve.c
-ZSYMMLCOPY_M = zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
@@ -1,196 +1 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c

SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c

ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c

ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S


SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S

DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S

ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
else
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
endif
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif

SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S

ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))

ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif

DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif

ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif

DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
include $(KERNELDIR)/KERNEL.CORTEXA53
@@ -1,184 +1 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c

SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c

ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c

ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S

ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))

ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif

DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif

ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif

DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c

SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c

SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S

ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c

SNRM2KERNEL = scnrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c
#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c


DDOTKERNEL = dot.c
SDOTKERNEL = dot.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S

ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S
endif

ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S
endif

ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S
endif

ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S
endif
include $(KERNELDIR)/KERNEL.THUNDERX2T99
@@ -1,4 +1,5 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
Copyright (c) 2022, Arm Ltd
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <arm_sve.h>

#ifdef DOUBLE
#define SVE_TYPE svfloat64_t
#define SVE_ZERO svdup_f64(0.0)
#define SVE_WHILELT svwhilelt_b64
#define SVE_ALL svptrue_b64()
#define SVE_WIDTH svcntd()
#define DTYPE "d"
#define WIDTH "d"
#define SHIFT "3"
#else
#define SVE_TYPE svfloat32_t
#define SVE_ZERO svdup_f32(0.0)
#define SVE_WHILELT svwhilelt_b32
#define SVE_ALL svptrue_b32()
#define SVE_WIDTH svcntw()
#define DTYPE "s"
#define WIDTH "w"
#define SHIFT "2"
#endif
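For orientation only (not part of the diff): DTYPE, WIDTH and SHIFT above are string literals, so the asm templates in this hunk are assembled by C's adjacent-string-literal concatenation at compile time. A minimal standalone sketch of that technique, using the double-precision values as an assumed example:

#include <stdio.h>

/* Hypothetical illustration of the string-pasting the kernel relies on. */
#define DTYPE "d"
#define WIDTH "d"
#define SHIFT "3"

int main(void) {
    /* Adjacent literals collapse into one string at compile time. */
    const char *tmpl =
        " ld1" WIDTH " { z2." DTYPE " }, p0/z, [x12, x8, lsl #" SHIFT "]\n";
    fputs(tmpl, stdout);  /* prints:  ld1d { z2.d }, p0/z, [x12, x8, lsl #3] */
    return 0;
}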
/* removed by this hunk: the previous implementation, built on SVE ACLE intrinsics */
static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
  SVE_TYPE acc_a = SVE_ZERO;
  SVE_TYPE acc_b = SVE_ZERO;

  BLASLONG sve_width = SVE_WIDTH;

  for (BLASLONG i = 0; i < n; i += sve_width * 2) {
    svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
    svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);

    SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
    SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
    SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
    SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);

    acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a);
    acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b);
  }

  return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b);
}

/* added by this hunk: the replacement implementation, hand-written inline assembly */
#define COUNT \
  " cnt"WIDTH" x9 \n"
#define SETUP_TRUE \
  " ptrue p0."DTYPE" \n"
#define OFFSET_INPUTS \
  " add x12, %[X_], x9, lsl #"SHIFT" \n" \
  " add x13, %[Y_], x9, lsl #"SHIFT" \n"
#define TAIL_WHILE \
  " whilelo p1."DTYPE", x8, x0 \n"
#define UPDATE(pg, x,y,out) \
  " ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \
  " ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \
  " fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n"
#define SUM_VECTOR(v) \
  " faddv "DTYPE""v", p0, z"v"."DTYPE" \n"
#define RET \
  " fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n"

#define DOT_KERNEL \
  COUNT \
  " mov z1.d, #0 \n" \
  " mov z0.d, #0 \n" \
  " mov x8, #0 \n" \
  " movi d1, #0x0 \n" \
  SETUP_TRUE \
  " neg x10, x9, lsl #1 \n" \
  " ands x11, x10, x0 \n" \
  " b.eq 2f // skip_2x \n" \
  OFFSET_INPUTS \
  "1: // vector_2x \n" \
  UPDATE("p0", "%[X_]", "%[Y_]", "z1") \
  UPDATE("p0", "x12", "x13", "z0") \
  " sub x8, x8, x10 \n" \
  " cmp x8, x11 \n" \
  " b.lo 1b // vector_2x \n" \
  SUM_VECTOR("1") \
  "2: // skip_2x \n" \
  " neg x10, x9 \n" \
  " and x10, x10, x0 \n" \
  " cmp x8, x10 \n" \
  " b.hs 4f // tail \n" \
  "3: // vector_1x \n" \
  UPDATE("p0", "%[X_]", "%[Y_]", "z0") \
  " add x8, x8, x9 \n" \
  " cmp x8, x10 \n" \
  " b.lo 3b // vector_1x \n" \
  "4: // tail \n" \
  " cmp x10, x0 \n" \
  " b.eq 5f // end \n" \
  TAIL_WHILE \
  UPDATE("p1", "%[X_]", "%[Y_]", "z0") \
  "5: // end \n" \
  SUM_VECTOR("0") \
  RET

static
FLOAT
dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y)
{
  FLOAT ret;

  asm(DOT_KERNEL
  :
  [RET_] "=&w" (ret)
  :
  [N_] "r" (n),
  [X_] "r" (x),
  [Y_] "r" (y)
  :);

  return ret;
}
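As a reading aid (a plain-C sketch, not code from this commit): both versions above compute the same unit-stride dot product, whose scalar definition is simply:

/* Scalar reference for what dot_kernel_sve computes; shown with double,
   though the kernel is compiled for either precision via FLOAT. */
static double dot_ref(long n, const double *x, const double *y) {
    double acc = 0.0;
    for (long i = 0; i < n; i++)
        acc += x[i] * y[i];
    return acc;
}

The SVE predicates (whilelo in the asm, SVE_WHILELT in the intrinsics) exist only so the final partial vector of this sum can be handled without a separate scalar tail loop.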
@@ -0,0 +1,110 @@
ifndef NO_LSX

SDOTKERNEL = dot_lsx.S
DSDOTKERNEL = dot_lsx.S
DDOTKERNEL = dot_lsx.S
CDOTKERNEL = cdot_lsx.S
ZDOTKERNEL = cdot_lsx.S

SSCALKERNEL = scal_lsx.S
DSCALKERNEL = scal_lsx.S
CSCALKERNEL = cscal_lsx.S
ZSCALKERNEL = cscal_lsx.S

SAMAXKERNEL = amax_lsx.S
DAMAXKERNEL = amax_lsx.S
CAMAXKERNEL = camax_lsx.S

SAMINKERNEL = amin_lsx.S
DAMINKERNEL = amin_lsx.S
CAMINKERNEL = camin_lsx.S

SMAXKERNEL = max_lsx.S
DMAXKERNEL = max_lsx.S

SMINKERNEL = min_lsx.S
DMINKERNEL = min_lsx.S

ISMAXKERNEL = imax_lsx.S
IDMAXKERNEL = imax_lsx.S

ISMINKERNEL = imin_lsx.S
IDMINKERNEL = imin_lsx.S

ISAMAXKERNEL = iamax_lsx.S
IDAMAXKERNEL = iamax_lsx.S
ICAMAXKERNEL = icamax_lsx.S
IZAMAXKERNEL = icamax_lsx.S

ISAMINKERNEL = iamin_lsx.S
IDAMINKERNEL = iamin_lsx.S
ICAMINKERNEL = icamin_lsx.S
IZAMINKERNEL = icamin_lsx.S

SCOPYKERNEL = copy_lsx.S
DCOPYKERNEL = copy_lsx.S
CCOPYKERNEL = ccopy_lsx.S
ZCOPYKERNEL = ccopy_lsx.S

SSWAPKERNEL = swap_lsx.S
DSWAPKERNEL = swap_lsx.S

SAXPYKERNEL = axpy_lsx.S
DAXPYKERNEL = axpy_lsx.S
CAXPYKERNEL = caxpy_lsx.S
ZAXPYKERNEL = caxpy_lsx.S

SAXPBYKERNEL = axpby_lsx.S
DAXPBYKERNEL = axpby_lsx.S

SSUMKERNEL = sum_lsx.S
DSUMKERNEL = sum_lsx.S

SASUMKERNEL = asum_lsx.S
DASUMKERNEL = asum_lsx.S
CASUMKERNEL = casum_lsx.S
ZASUMKERNEL = casum_lsx.S

SROTKERNEL = rot_lsx.S
DROTKERNEL = rot_lsx.S
CROTKERNEL = crot_lsx.S
ZROTKERNEL = crot_lsx.S

SNRM2KERNEL = snrm2_lsx.S
DNRM2KERNEL = dnrm2_lsx.S
CNRM2KERNEL = cnrm2_lsx.S
ZNRM2KERNEL = znrm2_lsx.S

CSWAPKERNEL = cswap_lsx.S
ZSWAPKERNEL = cswap_lsx.S

CSUMKERNEL = csum_lsx.S
ZSUMKERNEL = csum_lsx.S

DGEMMKERNEL = dgemm_kernel_8x4.S
DGEMMINCOPY = dgemm_ncopy_8_lsx.S
DGEMMITCOPY = dgemm_tcopy_8_lsx.S
DGEMMONCOPY = dgemm_ncopy_4_lsx.S
DGEMMOTCOPY = dgemm_tcopy_4_lsx.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CGEMMKERNEL = cgemm_kernel_2x2_lsx.S
CGEMMONCOPY = cgemm_ncopy_2_lsx.S
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

endif
@@ -1,4 +1,86 @@
ifndef NO_LASX

SDOTKERNEL = dot_lasx.S
DSDOTKERNEL = dot_lasx.S
DDOTKERNEL = dot_lasx.S
CDOTKERNEL = cdot_lasx.S
ZDOTKERNEL = cdot_lasx.S

SSCALKERNEL = scal_lasx.S
DSCALKERNEL = scal_lasx.S
CSCALKERNEL = cscal_lasx.S
ZSCALKERNEL = cscal_lasx.S

SAMAXKERNEL = amax_lasx.S
DAMAXKERNEL = amax_lasx.S
CAMAXKERNEL = camax_lasx.S

SAMINKERNEL = amin_lasx.S
DAMINKERNEL = amin_lasx.S
CAMINKERNEL = camin_lasx.S

SMAXKERNEL = max_lsx.S
DMAXKERNEL = max_lsx.S

SMINKERNEL = min_lsx.S
DMINKERNEL = min_lsx.S

ISMAXKERNEL = imax_lasx.S
IDMAXKERNEL = imax_lasx.S

ISMINKERNEL = imin_lasx.S
IDMINKERNEL = imin_lasx.S

ISAMAXKERNEL = iamax_lasx.S
IDAMAXKERNEL = iamax_lasx.S
ICAMAXKERNEL = icamax_lasx.S
IZAMAXKERNEL = icamax_lasx.S

ISAMINKERNEL = iamin_lasx.S
IDAMINKERNEL = iamin_lasx.S
ICAMINKERNEL = icamin_lasx.S
IZAMINKERNEL = icamin_lasx.S

SCOPYKERNEL = copy_lasx.S
DCOPYKERNEL = copy_lasx.S
CCOPYKERNEL = ccopy_lasx.S
ZCOPYKERNEL = ccopy_lasx.S

SSWAPKERNEL = swap_lasx.S
DSWAPKERNEL = swap_lasx.S

SAXPYKERNEL = axpy_lasx.S
DAXPYKERNEL = axpy_lasx.S
CAXPYKERNEL = caxpy_lasx.S
ZAXPYKERNEL = caxpy_lasx.S

SAXPBYKERNEL = axpby_lasx.S
DAXPBYKERNEL = axpby_lasx.S

SSUMKERNEL = sum_lasx.S
DSUMKERNEL = sum_lasx.S

SASUMKERNEL = asum_lasx.S
DASUMKERNEL = asum_lasx.S
CASUMKERNEL = casum_lasx.S
ZASUMKERNEL = casum_lasx.S

SROTKERNEL = rot_lasx.S
DROTKERNEL = rot_lasx.S
CROTKERNEL = crot_lasx.S
ZROTKERNEL = crot_lasx.S

SNRM2KERNEL = snrm2_lasx.S
DNRM2KERNEL = dnrm2_lasx.S
CNRM2KERNEL = cnrm2_lasx.S
ZNRM2KERNEL = znrm2_lasx.S

CSWAPKERNEL = cswap_lasx.S
ZSWAPKERNEL = cswap_lasx.S

CSUMKERNEL = csum_lasx.S
ZSUMKERNEL = csum_lasx.S

DGEMMKERNEL = dgemm_kernel_16x4.S
DGEMMINCOPY = dgemm_ncopy_16.S
DGEMMITCOPY = dgemm_tcopy_16.S
@@ -25,13 +107,35 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMVNKERNEL = sgemv_n_8_lasx.S
SGEMVTKERNEL = sgemv_t_8_lasx.S

CGEMMKERNEL = cgemm_kernel_2x2_lsx.S
CGEMMONCOPY = cgemm_ncopy_2_lsx.S
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZGEMMKERNEL = zgemm_kernel_2x2_lasx.S
ZGEMMONCOPY = zgemm_ncopy_2_lasx.S
ZGEMMOTCOPY = zgemm_tcopy_2_lasx.S
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = dtrsm_kernel_LN_16x4_lasx.S
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_lasx.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_16x4_lasx.S
DTRSMKERNEL_RT = dtrsm_kernel_RT_16x4_lasx.S
endif

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
@@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6

#define I $r12
#define TEMP $r13

#define VM0 $xr0
#define VM1 $xr1
#define VM2 $xr2
#define VX0 $xr3
#define VX1 $xr4
#define VX2 $xr5
#define VX3 $xr6

#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r17

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
xvldrepl.d VM0, X, 0
#else
xvldrepl.w VM0, X, 0
#endif
XVFSUB VM0, VM0, VM0
bne INCX, TEMP, .L20

srai.d I, N, 4
bge $r0, I, .L11
.align 3

.L10:
#ifdef DOUBLE
xvld VX0, X, 0
xvld VX1, X, 32
xvld VX2, X, 64
xvld VX3, X, 96
addi.d I, I, -1
addi.d X, X, 128
XVFMAXA VM1, VX0, VX1
XVFMAXA VM2, VX2, VX3
XVFMAXA VM0, VM0, VM1
XVFMAXA VM0, VM0, VM2
#else
xvld VX0, X, 0
xvld VX1, X, 32
addi.d I, I, -1
addi.d X, X, 64
XVFMAXA VM1, VX0, VX1
XVFMAXA VM0, VM0, VM1
#endif
blt $r0, I, .L10

#ifdef DOUBLE
xvrepl128vei.d VX0, VM0, 0
xvrepl128vei.d VX1, VM0, 1
XVFMAXA VM0, VX0, VX1
#else
xvrepl128vei.w VX0, VM0, 0
xvrepl128vei.w VX1, VM0, 1
xvrepl128vei.w VX2, VM0, 2
xvrepl128vei.w VX3, VM0, 3
XVFMAXA VM1, VX0, VX1
XVFMAXA VM2, VX2, VX3
XVFMAXA VM0, VM1, VM2
#endif
xvpermi.q VM1, VM0, 0x1
XVFMAXA VM0, VM0, VM1
.align 3

.L11:
andi I, N, 0x0f
bge $r0, I, .L13
.align 3

.L12: /* 0 < N < 16 */
LD $f1, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
FMAXA $f0, $f0, $f1
bnez I, .L12
.align 3

.L13:
FABS $f0, $f0
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
srai.d I, N, 3
bge $r0, I, .L23
.align 3

.L21:
#ifdef DOUBLE
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfmaxa.d VM1, VX0, VX1
xvfmaxa.d VM0, VM0, VM1
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VM1, t1, 0
xvinsgr2vr.w VM1, t2, 1
xvinsgr2vr.w VM1, t3, 2
xvinsgr2vr.w VM1, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VM1, t1, 4
xvinsgr2vr.w VM1, t2, 5
xvinsgr2vr.w VM1, t3, 6
xvinsgr2vr.w VM1, t4, 7
xvfmaxa.s VM0, VM0, VM1
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
#ifdef DOUBLE
xvrepl128vei.d VX0, VM0, 0
xvrepl128vei.d VX1, VM0, 1
XVFMAXA VM0, VX0, VX1
#else
xvrepl128vei.w VX0, VM0, 0
xvrepl128vei.w VX1, VM0, 1
xvrepl128vei.w VX2, VM0, 2
xvrepl128vei.w VX3, VM0, 3
XVFMAXA VM1, VX0, VX1
XVFMAXA VM2, VX2, VX3
XVFMAXA VM0, VM1, VM2
#endif
xvpermi.q VM1, VM0, 1
XVFMAXA VM0, VM0, VM1
.align 3

.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3

.L24: /* 0 < N < 8 */
LD $f1, X, 0
addi.d I, I, -1
add.d X, X, INCX
FMAXA $f0, $f0, $f1
bnez I, .L24
.align 3

.L999:
FABS $f0, $f0
jirl $r0, $r1, 0x0

EPILOGUE
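For orientation (a hedged sketch, not part of the diff): the file above is a LASX kernel for AMAX, the largest absolute value of N elements read with stride INCX; xvfmaxa/FMAXA select the operand with the larger magnitude, the 256-bit body handles full batches, and the .L12/.L24 loops pick up the scalar tail. The scalar semantics are:

#include <math.h>

/* Plain-C semantics of the AMAX kernel: max |x[i*incx]| over n elements.
   Returning 0 for non-positive n or incx mirrors the usual BLAS convention
   (the assembly branches straight to its epilogue in those cases). */
static double amax_ref(long n, const double *x, long incx) {
    if (n <= 0 || incx <= 0) return 0.0;
    double m = 0.0;
    for (long i = 0; i < n; i++) {
        double a = fabs(x[i * incx]);
        if (a > m) m = a;
    }
    return m;
}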
@@ -0,0 +1,231 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6

#define I $r12
#define TEMP $r13

#define VM0 $vr0
#define VM1 $vr1
#define VM2 $vr2
#define VX0 $vr3
#define VX1 $vr4
#define VX2 $vr5
#define VX3 $vr6

#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r17

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
vldrepl.d VM0, X, 0
#else
vldrepl.w VM0, X, 0
#endif
VFSUB VM0, VM0, VM0
bne INCX, TEMP, .L20

srai.d I, N, 3
bge $r0, I, .L11
.align 3

.L10:
#ifdef DOUBLE
vld VX0, X, 0
vld VX1, X, 16
vld VX2, X, 32
vld VX3, X, 48
addi.d I, I, -1
addi.d X, X, 64
VFMAXA VM1, VX0, VX1
VFMAXA VM2, VX2, VX3
VFMAXA VM0, VM0, VM1
VFMAXA VM0, VM0, VM2
#else
vld VX0, X, 0
vld VX1, X, 16
addi.d I, I, -1
addi.d X, X, 32
VFMAXA VM1, VX0, VX1
VFMAXA VM0, VM0, VM1
#endif
blt $r0, I, .L10

#ifdef DOUBLE
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
VFMAXA VM0, VX0, VX1
#else
vreplvei.w VX0, VM0, 0
vreplvei.w VX1, VM0, 1
vreplvei.w VX2, VM0, 2
vreplvei.w VX3, VM0, 3
VFMAXA VM1, VX0, VX1
VFMAXA VM2, VX2, VX3
VFMAXA VM0, VM1, VM2
#endif
.align 3

.L11:
andi I, N, 7
bge $r0, I, .L13
.align 3

.L12:
LD $f1, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
FMAXA $f0, $f0, $f1
bnez I, .L12
.align 3

.L13:
FABS $f0, $f0
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
srai.d I, N, 3
bge $r0, I, .L23
.align 3

.L21:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM2, VX0, VX1
vfmaxa.d VM1, VM1, VM2
vfmaxa.d VM0, VM0, VM1
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfmaxa.s VM1, VX0, VX1
vfmaxa.s VM0, VM0, VM1
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
#ifdef DOUBLE
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
VFMAXA VM0, VX0, VX1
#else
vreplvei.w VX0, VM0, 0
vreplvei.w VX1, VM0, 1
vreplvei.w VX2, VM0, 2
vreplvei.w VX3, VM0, 3
VFMAXA VM1, VX0, VX1
VFMAXA VM2, VX2, VX3
VFMAXA VM0, VM1, VM2
#endif
.align 3

.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3

.L24:
LD $f1, X, 0
addi.d I, I, -1
add.d X, X, INCX
FMAXA $f0, $f0, $f1
bnez I, .L24
.align 3

.L999:
FABS $f0, $f0
jirl $r0, $r1, 0x0

EPILOGUE
@@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6

#define I $r12
#define TEMP $r13

#define VM0 $xr0
#define VM1 $xr1
#define VM2 $xr2
#define VX0 $xr3
#define VX1 $xr4
#define VX2 $xr5
#define VX3 $xr6

#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r17

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
xvldrepl.d VM0, X, 0
#else
xvldrepl.w VM0, X, 0
#endif
XVFSUB VM0, VM0, VM0
bne INCX, TEMP, .L20

srai.d I, N, 4
bge $r0, I, .L11
.align 3

.L10:
#ifdef DOUBLE
xvld VX0, X, 0
xvld VX1, X, 32
xvld VX2, X, 64
xvld VX3, X, 96
addi.d I, I, -1
addi.d X, X, 128
XVFMINA VM1, VX0, VX1
XVFMINA VM2, VX2, VX3
XVFMINA VM0, VM0, VM1
XVFMINA VM0, VM0, VM2
#else
xvld VX0, X, 0
xvld VX1, X, 32
addi.d I, I, -1
addi.d X, X, 64
XVFMINA VM1, VX0, VX1
XVFMINA VM0, VM0, VM1
#endif
blt $r0, I, .L10

#ifdef DOUBLE
xvrepl128vei.d VX0, VM0, 0
xvrepl128vei.d VX1, VM0, 1
XVFMINA VM0, VX0, VX1
#else
xvrepl128vei.w VX0, VM0, 0
xvrepl128vei.w VX1, VM0, 1
xvrepl128vei.w VX2, VM0, 2
xvrepl128vei.w VX3, VM0, 3
XVFMINA VM1, VX0, VX1
XVFMINA VM2, VX2, VX3
XVFMINA VM0, VM1, VM2
#endif
xvpermi.q VM1, VM0, 0x1
XVFMINA VM0, VM0, VM1
.align 3

.L11:
andi I, N, 0x0f
bge $r0, I, .L13
.align 3

.L12: /* 0 < N < 16 */
LD $f1, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
FMINA $f0, $f0, $f1
bnez I, .L12
.align 3

.L13:
FABS $f0, $f0
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
srai.d I, N, 3
bge $r0, I, .L23
.align 3

.L21:
#ifdef DOUBLE
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfmaxa.d VM1, VX0, VX1
xvfmaxa.d VM0, VM0, VM1
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VM1, t1, 0
xvinsgr2vr.w VM1, t2, 1
xvinsgr2vr.w VM1, t3, 2
xvinsgr2vr.w VM1, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VM1, t1, 4
xvinsgr2vr.w VM1, t2, 5
xvinsgr2vr.w VM1, t3, 6
xvinsgr2vr.w VM1, t4, 7
xvfmaxa.s VM0, VM0, VM1
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
#ifdef DOUBLE
xvrepl128vei.d VX0, VM0, 0
xvrepl128vei.d VX1, VM0, 1
XVFMINA VM0, VX0, VX1
#else
xvrepl128vei.w VX0, VM0, 0
xvrepl128vei.w VX1, VM0, 1
xvrepl128vei.w VX2, VM0, 2
xvrepl128vei.w VX3, VM0, 3
XVFMINA VM1, VX0, VX1
XVFMINA VM2, VX2, VX3
XVFMINA VM0, VM1, VM2
#endif
xvpermi.q VM1, VM0, 1
XVFMINA VM0, VM0, VM1
.align 3

.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3

.L24: /* 0 < N < 8 */
LD $f1, X, 0
addi.d I, I, -1
add.d X, X, INCX
FMINA $f0, $f0, $f1
bnez I, .L24
.align 3

.L999:
FABS $f0, $f0
jirl $r0, $r1, 0x0

EPILOGUE
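Similarly hedged orientation for the two AMIN files (LASX above, LSX next): FMINA/XVFMINA keep the operand with the smaller magnitude, so these kernels compute the smallest absolute value of the vector:

#include <math.h>

/* Plain-C semantics of the AMIN kernels: min |x[i*incx]| over n elements
   (starting from the first element). */
static double amin_ref(long n, const double *x, long incx) {
    if (n <= 0 || incx <= 0) return 0.0;
    double m = fabs(x[0]);
    for (long i = 1; i < n; i++) {
        double a = fabs(x[i * incx]);
        if (a < m) m = a;
    }
    return m;
}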
@@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6

#define I $r12
#define TEMP $r13

#define VM0 $vr0
#define VM1 $vr1
#define VM2 $vr2
#define VX0 $vr3
#define VX1 $vr4
#define VX2 $vr5
#define VX3 $vr6

#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r17

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
vldrepl.d VM0, X, 0
#else
vldrepl.w VM0, X, 0
#endif
VFSUB VM0, VM0, VM0
bne INCX, TEMP, .L20

srai.d I, N, 3
bge $r0, I, .L11
.align 3

.L10:
#ifdef DOUBLE
vld VX0, X, 0
vld VX1, X, 16
vld VX2, X, 32
vld VX3, X, 48
addi.d I, I, -1
addi.d X, X, 64
VFMINA VM1, VX0, VX1
VFMINA VM2, VX2, VX3
VFMINA VM0, VM0, VM1
VFMINA VM0, VM0, VM2
#else
vld VX0, X, 0
vld VX1, X, 16
addi.d I, I, -1
addi.d X, X, 32
VFMINA VM1, VX0, VX1
VFMINA VM0, VM0, VM1
#endif
blt $r0, I, .L10

#ifdef DOUBLE
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
VFMINA VM0, VX0, VX1
#else
vreplvei.w VX0, VM0, 0
vreplvei.w VX1, VM0, 1
vreplvei.w VX2, VM0, 2
vreplvei.w VX3, VM0, 3
VFMINA VM1, VX0, VX1
VFMINA VM2, VX2, VX3
VFMINA VM0, VM1, VM2
#endif
.align 3

.L11:
andi I, N, 7
bge $r0, I, .L13
.align 3

.L12:
LD $f1, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
FMINA $f0, $f0, $f1
bnez I, .L12
.align 3

.L13:
FABS $f0, $f0
SUB $f0, $f0, $f0
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
srai.d I, N, 3
bge $r0, I, .L23
.align 3

.L21:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM2, VX0, VX1
vfmaxa.d VM1, VM1, VM2
vfmaxa.d VM0, VM0, VM1
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfmaxa.s VM1, VX0, VX1
vfmaxa.s VM0, VM0, VM1
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
#ifdef DOUBLE
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
VFMINA VM0, VX0, VX1
#else
vreplvei.w VX0, VM0, 0
vreplvei.w VX1, VM0, 1
vreplvei.w VX2, VM0, 2
vreplvei.w VX3, VM0, 3
VFMINA VM1, VX0, VX1
VFMINA VM2, VX2, VX3
VFMINA VM0, VM1, VM2
#endif
.align 3

.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3

.L24:
LD $f1, X, 0
addi.d I, I, -1
add.d X, X, INCX
FMINA $f0, $f0, $f1
bnez I, .L24
.align 3

.L999:
FABS $f0, $f0
jirl $r0, $r1, 0x0

EPILOGUE
@@ -0,0 +1,257 @@
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define t1 $r15
|
||||
#define t2 $r12
|
||||
#define t3 $r13
|
||||
#define t4 $r14
|
||||
#define VX0 $xr12
|
||||
#define VX1 $xr13
|
||||
#define VX2 $xr14
|
||||
#define VX3 $xr15
|
||||
#define VT0 $xr23
|
||||
#define VT1 $xr22
|
||||
#define res1 $xr16
|
||||
#define res2 $xr17
|
||||
#define res0 $xr18
|
||||
#define neg1 $xr19
|
||||
|
||||
PROLOGUE
|
||||
xvxor.v res1, res1, res1
|
||||
xvxor.v res2, res2, res2
|
||||
xvxor.v res0, res0, res0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
#ifdef DOUBLE
|
||||
li.d t1, -1
|
||||
xvreplgr2vr.d neg1, t1
|
||||
xvffint.d.l neg1, neg1
|
||||
#else
|
||||
li.w t1, -1
|
||||
xvreplgr2vr.w neg1, t1
|
||||
xvffint.s.w neg1, neg1
|
||||
#endif
|
||||
li.d TEMP, SIZE
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvfmul.d VX2, neg1, VX0
|
||||
xvfmul.d VX3, neg1, VX1
|
||||
xvfcmp.clt.d VT0, VX0, res0
|
||||
xvfcmp.clt.d VT1, VX1, res0
|
||||
xvbitsel.v VX0, VX0, VX2, VT0
|
||||
xvbitsel.v VX1, VX1, VX3, VT1
|
||||
xvfadd.d res2, VX0, VX1
|
||||
xvfadd.d res1, res1, res2
|
||||
#else
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvfmul.s VX2, neg1, VX0
|
||||
xvfcmp.clt.s VT0, VX0, res0
|
||||
xvbitsel.v VX0, VX0, VX2, VT0
|
||||
xvfadd.s res1, VX0, res1
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L11
|
||||
.align 3
|
||||
|
||||
.L12:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VX1, res1, 1
|
||||
xvpickve.d VX2, res1, 2
|
||||
xvpickve.d VX3, res1, 3
|
||||
xvfadd.d res1, VX1, res1
|
||||
xvfadd.d res1, VX2, res1
|
||||
xvfadd.d res1, VX3, res1
|
||||
#else
|
||||
xvfadd.s res2, res1, res2
|
||||
xvpickve.w VX1, res1, 1
|
||||
xvpickve.w VX2, res1, 2
|
||||
xvpickve.w VX3, res1, 3
|
||||
xvfadd.s res1, VX1, res1
|
||||
xvfadd.s res1, VX2, res1
|
||||
xvfadd.s res1, VX3, res1
|
||||
xvpickve.w VX0, res2, 4
|
||||
xvpickve.w VX1, res2, 5
|
||||
xvpickve.w VX2, res2, 6
|
||||
xvpickve.w VX3, res2, 7
|
||||
xvfadd.s res1, VX0, res1
|
||||
xvfadd.s res1, VX1, res1
|
||||
xvfadd.s res1, VX2, res1
|
||||
xvfadd.s res1, VX2, res1
#endif
    .align 3

.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    LD $f12, X, 0 * SIZE
    FABS $f12, $f12
    ADD $f16, $f12, $f16
    addi.d I, I, -1
    addi.d X, X, SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

.L20:
    bge $r0, I, .L23
    .align 3

.L21:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
#else
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvfmul.s VX2, neg1, VX0
    xvfcmp.clt.s VT0, VX0, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvfadd.s res1, VX0, res1
#endif
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

.L22:
#ifdef DOUBLE
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
#else
    xvfadd.s res2, res1, res2
    xvpickve.w VX1, res1, 1
    xvpickve.w VX2, res1, 2
    xvpickve.w VX3, res1, 3
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
    xvpickve.w VX0, res2, 4
    xvpickve.w VX1, res2, 5
    xvpickve.w VX2, res2, 6
    xvpickve.w VX3, res2, 7
    xvfadd.s res1, VX0, res1
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
#endif
    .align 3

.L23:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD $f12, X, 0 * SIZE
    FABS $f12, $f12
    ADD $f16, $f12, $f16
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    MOV $f0, $f16
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE

@ -0,0 +1,258 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define VT0 $vr23
#define VT1 $vr22
#define res1 $vr16
#define res2 $vr17
#define res0 $vr18
#define neg1 $vr19

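// LSX (128-bit) variant of the ?ASUM loop above: the same structure on
// half-width $vr registers, still consuming eight elements per unrolled
// iteration (two load/select/add rounds in the DOUBLE path).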
    PROLOGUE
    vxor.v res1, res1, res1
    vxor.v res2, res2, res2
    vxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
#ifdef DOUBLE
    li.d t1, -1
    vreplgr2vr.d neg1, t1
    vffint.d.l neg1, neg1
#else
    li.w t1, -1
    vreplgr2vr.w neg1, t1
    vffint.s.w neg1, neg1
#endif
    li.d TEMP, SIZE
    slli.d INCX, INCX, BASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L13
    .align 3

.L11:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
#else
    vld VX0, X, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vfmul.s VX2, neg1, VX0
    vfmul.s VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.s res2, VX0, VX1
    vfadd.s res1, res1, res2
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L11
    .align 3

.L12:
#ifdef DOUBLE
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
#else
    vreplvei.w VX1, res1, 1
    vreplvei.w VX2, res1, 2
    vreplvei.w VX3, res1, 3
    vfadd.s res1, VX1, res1
    vfadd.s res1, VX2, res1
    vfadd.s res1, VX3, res1
#endif
    .align 3

.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    LD $f12, X, 0 * SIZE
    FABS $f12, $f12
    ADD $f16, $f12, $f16
    addi.d I, I, -1
    addi.d X, X, SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

.L20:
    bge $r0, I, .L23
    .align 3

.L21:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    vinsgr2vr.d VX1, t1, 0
    vinsgr2vr.d VX1, t2, 1
    add.d X, X, INCX
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t3, 0
    vinsgr2vr.d VX0, t4, 1
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
#else
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vfmul.s VX2, neg1, VX0
    vfmul.s VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.s res2, VX0, VX1
    vfadd.s res1, res1, res2
#endif
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

.L22:
#ifdef DOUBLE
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
#else
    vreplvei.w VX1, res1, 1
    vreplvei.w VX2, res1, 2
    vreplvei.w VX3, res1, 3
    vfadd.s res1, VX1, res1
    vfadd.s res1, VX2, res1
    vfadd.s res1, VX3, res1
#endif
    .align 3

.L23:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD $f12, X, 0 * SIZE
    FABS $f12, $f12
    ADD $f16, $f12, $f16
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    MOV $f0, $f16
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE

File diff suppressed because it is too large
File diff suppressed because it is too large

@ -0,0 +1,529 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define XX $r5
#define YY $r6
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10

#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
#define VXA $xr23

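// ?AXPY kernel (y := alpha * x + y) using LASX: alpha is broadcast into VXA,
// alpha == 0 returns immediately, and the four INCX/INCY stride combinations
// are dispatched to .L11/.L12/.L21/.L22 below.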
    PROLOGUE

    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    movgr2fr.d a2, TEMP
    FFINT a2, a2
    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L999
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    MTG t1, ALPHA
#ifdef DOUBLE
    xvreplgr2vr.d VXA, t1
#else
    xvreplgr2vr.w VXA, t1
#endif

    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

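// Unit-stride path: when alpha == 1 (the a2 constant built above) the fused
// multiply-add is skipped and .L111 does a plain vector add; otherwise .L112
// uses xvfmadd with the broadcast alpha.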
.L11:
    bge $r0, I, .L113
    CMPEQ $fcc0, ALPHA, a2
    bceqz $fcc0, .L112
    .align 3

.L111:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvld VX3, Y, 4 * SIZE
    xvfadd.d VX2, VX0, VX2
    xvfadd.d VX3, VX1, VX3
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
#else
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    addi.d I, I, -1
    xvfadd.s VX2, VX0, VX2
    xvst VX2, Y, 0 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
    b .L113
    .align 3

.L112:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvld VX3, Y, 4 * SIZE
    xvfmadd.d VX2, VX0, VXA, VX2
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
#else
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    addi.d I, I, -1
    xvfmadd.s VX2, VX0, VXA, VX2
    xvst VX2, Y, 0 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L112
    .align 3

.L113:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L114:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    addi.d X, X, SIZE
    addi.d Y, Y, SIZE
    blt $r0, I, .L114
    b .L999
    .align 3

.L12: // INCX==1 and INCY!=1
    bge $r0, I, .L122
    move YY, Y
    .align 3

.L121:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX2, t1, 0
    xvinsgr2vr.d VX2, t2, 1
    xvinsgr2vr.d VX2, t3, 2
    xvinsgr2vr.d VX2, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX2, VX0, VXA, VX2
    xvld VX1, X, 4 * SIZE
    xvstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 3
    add.d YY, YY, INCY
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX3, t1, 0
    xvinsgr2vr.d VX3, t2, 1
    xvinsgr2vr.d VX3, t3, 2
    xvinsgr2vr.d VX3, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
#else
    xvld VX0, X, 0 * SIZE
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    xvinsgr2vr.w VX2, t1, 0
    xvinsgr2vr.w VX2, t2, 1
    xvinsgr2vr.w VX2, t3, 2
    xvinsgr2vr.w VX2, t4, 3
    add.d Y, Y, INCY
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    xvinsgr2vr.w VX2, t1, 4
    xvinsgr2vr.w VX2, t2, 5
    xvinsgr2vr.w VX2, t3, 6
    xvinsgr2vr.w VX2, t4, 7
    add.d Y, Y, INCY
    xvfmadd.s VX2, VX0, VXA, VX2
    addi.d I, I, -1
    xvstelm.w VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 3
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 4
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 5
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 6
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 7
#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L121
    .align 3

.L122:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L123:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    addi.d X, X, SIZE
    add.d Y, Y, INCY
    blt $r0, I, .L123
    b .L999
    .align 3

.L21: // INCX!=1 and INCY==1
    bge $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    xvld VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    add.d X, X, INCX
    xvfmadd.d VX2, VX0, VXA, VX2
    xvld VX3, Y, 4 * SIZE
    xvst VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    add.d X, X, INCX
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvst VX3, Y, 4 * SIZE
    addi.d Y, Y, 8 * SIZE
#else
    xvld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvfmadd.s VX2, VX0, VXA, VX2
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    addi.d Y, Y, 8 * SIZE
#endif
    blt $r0, I, .L211
    .align 3

.L212:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L213:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    add.d X, X, INCX
    addi.d Y, Y, SIZE
    blt $r0, I, .L213
    b .L999
    .align 3

.L22:
    bge $r0, I, .L223
    move YY, Y
    .align 3

.L222:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    add.d X, X, INCX
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX2, t1, 0
    xvinsgr2vr.d VX2, t2, 1
    xvinsgr2vr.d VX2, t3, 2
    xvinsgr2vr.d VX2, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX2, VX0, VXA, VX2
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX2, YY, 0, 3
    add.d YY, YY, INCY
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    xvinsgr2vr.d VX3, t1, 0
    xvinsgr2vr.d VX3, t2, 1
    xvinsgr2vr.d VX3, t3, 2
    xvinsgr2vr.d VX3, t4, 3
    add.d Y, Y, INCY
    xvfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    xvstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
#else
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w VX2, t1, 0
    xvinsgr2vr.w VX2, t2, 1
    xvinsgr2vr.w VX2, t3, 2
    xvinsgr2vr.w VX2, t4, 3
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    xvinsgr2vr.w VX2, t1, 4
    xvinsgr2vr.w VX2, t2, 5
    xvinsgr2vr.w VX2, t3, 6
    xvinsgr2vr.w VX2, t4, 7
    add.d Y, Y, INCY
    xvfmadd.s VX2, VX0, VXA, VX2
    addi.d I, I, -1
    xvstelm.w VX2, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 1
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 3
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 4
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 5
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 6
    add.d YY, YY, INCY
    xvstelm.w VX2, YY, 0, 7
#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    .align 3

.L223:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L224:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE

@ -0,0 +1,573 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define XX $r5
#define YY $r6
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10

#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VXA $vr23

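// LSX version of the same ?AXPY dispatch: alpha is broadcast into a 128-bit
// VXA, alpha == 0 exits early, and the four stride combinations follow the
// same .L11/.L12/.L21/.L22 layout as the LASX kernel above.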
    PROLOGUE

    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    movgr2fr.d a2, TEMP
    FFINT a2, a2
    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L999
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    MTG t1, ALPHA
#ifdef DOUBLE
    vreplgr2vr.d VXA, t1
#else
    vreplgr2vr.w VXA, t1
#endif

    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

.L11:
    bge $r0, I, .L113
    CMPEQ $fcc0, ALPHA, a2
    bceqz $fcc0, .L112
    .align 3

.L111:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vld VX3, Y, 2 * SIZE
    vfadd.d VX2, VX0, VX2
    vfadd.d VX3, VX1, VX3
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 2 * SIZE
    vld VX0, X, 4 * SIZE
    vld VX2, Y, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vld VX3, Y, 6 * SIZE
    vfadd.d VX2, VX0, VX2
    vfadd.d VX3, VX1, VX3
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
#else
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vld VX3, Y, 4 * SIZE
    vfadd.s VX2, VX0, VX2
    vfadd.s VX3, VX1, VX3
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 4 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L111
    b .L113
    .align 3

.L112:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vld VX3, Y, 2 * SIZE
    vfmadd.d VX2, VX0, VXA, VX2
    vfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 2 * SIZE
    vld VX0, X, 4 * SIZE
    vld VX2, Y, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vld VX3, Y, 6 * SIZE
    addi.d X, X, 8 * SIZE
    vfmadd.d VX2, VX0, VXA, VX2
    vfmadd.d VX3, VX1, VXA, VX3
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
    addi.d Y, Y, 8 * SIZE
#else
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vld VX3, Y, 4 * SIZE
    vfmadd.s VX2, VX0, VXA, VX2
    vfmadd.s VX3, VX1, VXA, VX3
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
#endif
    blt $r0, I, .L112
    .align 3

.L113:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L114:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    addi.d X, X, SIZE
    addi.d Y, Y, SIZE
    blt $r0, I, .L114
    b .L999
    .align 3

.L12: // INCX==1 and INCY!=1
    bge $r0, I, .L122
    move YY, Y
    .align 3

.L121:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    vld VX1, X, 2 * SIZE
    vstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    vld VX0, X, 4 * SIZE
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
    add.d YY, YY, INCY
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    vld VX1, X, 6 * SIZE
    vstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
#else
    vld VX0, X, 0 * SIZE
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    vinsgr2vr.w VX2, t1, 0
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    add.d Y, Y, INCY
    vfmadd.s VX2, VX0, VXA, VX2
    vld VX1, X, 4 * SIZE
    vstelm.w VX2, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.w VX2, YY, 0, 1
    add.d YY, YY, INCY
    vstelm.w VX2, YY, 0, 2
    add.d YY, YY, INCY
    vstelm.w VX2, YY, 0, 3
    add.d YY, YY, INCY
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    add.d Y, Y, INCY
    vfmadd.s VX3, VX1, VXA, VX3
    addi.d I, I, -1
    vstelm.w VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.w VX3, YY, 0, 1
    add.d YY, YY, INCY
    vstelm.w VX3, YY, 0, 2
    add.d YY, YY, INCY
    vstelm.w VX3, YY, 0, 3
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
#endif
    blt $r0, I, .L121
    .align 3

.L122:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L123:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    addi.d X, X, SIZE
    add.d Y, Y, INCY
    blt $r0, I, .L123
    b .L999
    .align 3

.L21: // INCX!=1 and INCY==1
    bge $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    vld VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    add.d X, X, INCX
    vfmadd.d VX2, VX0, VXA, VX2
    vld VX3, Y, 2 * SIZE
    vst VX2, Y, 0 * SIZE
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vfmadd.d VX3, VX1, VXA, VX3
    vld VX2, Y, 4 * SIZE
    vst VX3, Y, 2 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    add.d X, X, INCX
    vfmadd.d VX2, VX0, VXA, VX2
    vld VX3, Y, 6 * SIZE
    vst VX2, Y, 4 * SIZE
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    vst VX3, Y, 6 * SIZE
#else
    vld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    vfmadd.s VX2, VX0, VXA, VX2
    vld VX3, Y, 4 * SIZE
    vst VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    add.d X, X, INCX
    vfmadd.s VX3, VX1, VXA, VX3
    addi.d I, I, -1
    vst VX3, Y, 4 * SIZE
#endif
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L211
    .align 3

.L212:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L213:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    add.d X, X, INCX
    addi.d Y, Y, SIZE
    blt $r0, I, .L213
    b .L999
    .align 3

.L22:
    bge $r0, I, .L223
    move YY, Y
    .align 3

.L222:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    ld.d t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t4, Y, 0 * SIZE
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
    add.d YY, YY, INCY
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d Y, Y, INCY
    vfmadd.d VX2, VX0, VXA, VX2
    ld.d t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vstelm.d VX2, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX2, YY, 0, 1
    add.d YY, YY, INCY
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
    vinsgr2vr.d VX3, t1, 0
    vinsgr2vr.d VX3, t2, 1
    add.d Y, Y, INCY
    vfmadd.d VX3, VX1, VXA, VX3
    addi.d I, I, -1
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
#else
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    vinsgr2vr.w VX2, t1, 0
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    add.d Y, Y, INCY
    vfmadd.s VX2, VX0, VXA, VX2
    ld.w t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t2, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    add.d X, X, INCX
    ld.w t4, X, 0 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vstelm.w VX2, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.w VX2, YY, 0, 1
    add.d YY, YY, INCY
    vstelm.w VX2, YY, 0, 2
    add.d YY, YY, INCY
    vstelm.w VX2, YY, 0, 3
    add.d YY, YY, INCY
    ld.w t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t2, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.w t4, Y, 0 * SIZE
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    add.d Y, Y, INCY
    vfmadd.s VX3, VX1, VXA, VX3
    addi.d I, I, -1
    vstelm.w VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.w VX3, YY, 0, 1
    add.d YY, YY, INCY
    vstelm.w VX3, YY, 0, 2
    add.d YY, YY, INCY
    vstelm.w VX3, YY, 0, 3
#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    .align 3

.L223:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L224:
    LD $f12, X, 0 * SIZE
    LD $f14, Y, 0 * SIZE
    addi.d I, I, -1
    MADD $f14, $f12, $f0, $f14
    ST $f14, Y, 0 * SIZE
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE

@ -0,0 +1,194 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $f14
#define t2 $f18
#define t3 $f15
#define t4 $f17
#define s1 $f22
#define s2 $f9
#define s3 $f10
#define s4 $f11
#define TEMP $r16
#define a0 $f20
#define a1 $f21
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VT0 $xr13
#define VT1 $xr14
#define res0 $xr18
#define neg1 $xr19
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23

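// CAMAX kernel: for each complex element the 1-norm |re| + |im| is formed
// (abs again via multiply-by-(-1) plus compare/select) and the running
// maximum is kept in VM0; the scalar tail at .L24 handles the remainder.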
    PROLOGUE
    xvxor.v VM0, VM0, VM0
    xvxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    li.w I, -1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    xvreplgr2vr.w neg1, I
    xvffint.s.w neg1, neg1
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L23
    .align 3

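// xvpickev/xvpickod de-interleave the loaded complex data into real parts
// (x1) and imaginary parts (x2), spread across all eight 32-bit lanes.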
.L10:
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 8 * SIZE
    addi.d I, I, -1
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
    xvfmul.s x3, neg1, x1
    xvfmul.s x4, neg1, x2
    xvfcmp.clt.s VT0, x1, res0
    xvfcmp.clt.s VT1, x2, res0
    xvbitsel.v x1, x1, x3, VT0
    xvbitsel.v x2, x2, x4, VT1
    addi.d X, X, 16 * SIZE
    xvfadd.s VM1, x1, x2
    xvfmax.s VM0, VM0, VM1
    blt $r0, I, .L10
    .align 3

.L11:
    xvpermi.d VM1, VM0, 0x4e    // fold the high 128-bit half onto the low half so lanes 4..7 join the reduction
    xvfmax.s VM0, VM0, VM1
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
    xvfmax.s VM1, x1, x2
    xvfmax.s VM0, x3, x4
    xvfmax.s VM0, VM0, VM1
    b .L23
    .align 3

.L20: // INCX != 1
    bge $r0, I, .L23
    .align 3

.L21:
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s1, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s2, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    addi.d I, I, -1
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s3, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s4, t1, t3
    blt $r0, I, .L21
    .align 3

.L22:
    fmax.s s1, s1, s2
    fmax.s s3, s3, s4
    fmax.s s1, s1, s3
    .align 3

.L23: // N < 8
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD a0, X, 0 * SIZE
    LD a1, X, 1 * SIZE
    addi.d I, I, -1
    FABS a0, a0
    FABS a1, a1
    ADD a0, a0, a1
    add.d X, X, INCX
    fmax.s s1, a0, s1
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.s $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE

@ -0,0 +1,206 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $f14
#define t2 $f18
#define t3 $f15
#define t4 $f17
#define s1 $f22
#define s2 $f9
#define s3 $f10
#define s4 $f11
#define TEMP $r16
#define a0 $f20
#define a1 $f21
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VT0 $vr13
#define VT1 $vr14
#define res0 $vr18
#define neg1 $vr19
#define VX0 $vr20
#define VX1 $vr21
#define VM0 $vr22
#define VM1 $vr23

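// 128-bit LSX variant of CAMAX: two vld pairs per iteration cover eight
// complex elements, reduced with vfmax before the four-lane fold at .L11.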
    PROLOGUE
    vxor.v VM0, VM0, VM0
    vxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    li.w I, -1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    vreplgr2vr.w neg1, I
    vffint.s.w neg1, neg1
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L23
    .align 3

.L10:
    vld VX0, X, 0 * SIZE
    vld VX1, X, 4 * SIZE
    addi.d I, I, -1
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vfmul.s x3, neg1, x1
    vfmul.s x4, neg1, x2
    vfcmp.clt.s VT0, x1, res0
    vfcmp.clt.s VT1, x2, res0
    vld VX0, X, 8 * SIZE
    vbitsel.v x1, x1, x3, VT0
    vbitsel.v x2, x2, x4, VT1
    vld VX1, X, 12 * SIZE
    vfadd.s VM1, x1, x2
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vfmul.s x3, neg1, x1
    vfmul.s x4, neg1, x2
    vfcmp.clt.s VT0, x1, res0
    vfcmp.clt.s VT1, x2, res0
    addi.d X, X, 16 * SIZE
    vbitsel.v x1, x1, x3, VT0
    vbitsel.v x2, x2, x4, VT1
    vfadd.s x1, x1, x2
    vfmax.s VM1, x1, VM1
    vfmax.s VM0, VM0, VM1
    blt $r0, I, .L10
    .align 3

.L11:
    vreplvei.w x1, VM0, 0
    vreplvei.w x2, VM0, 1
    vreplvei.w x3, VM0, 2
    vreplvei.w x4, VM0, 3
    vfmax.s VM1, x1, x2
    vfmax.s VM0, x3, x4
    vfmax.s VM0, VM0, VM1
    b .L23
    .align 3

.L20: // INCX != 1
    bge $r0, I, .L23
    .align 3

.L21:
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s1, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s2, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    addi.d I, I, -1
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s3, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmax.s s4, t1, t3
    blt $r0, I, .L21
    .align 3

.L22:
    fmax.s s1, s1, s2
    fmax.s s3, s3, s4
    fmax.s s1, s1, s3
    .align 3

.L23: // N < 8
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    fld.s a0, X, 0 * SIZE
    fld.s a1, X, 1 * SIZE
    addi.d I, I, -1
    fabs.s a0, a0
    fabs.s a1, a1
    fadd.s a0, a0, a1
    add.d X, X, INCX
    fmax.s s1, a0, s1
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.s $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE

@ -0,0 +1,199 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define TEMP $r16
#define t1 $f14
#define t2 $f18
#define t3 $f15
#define t4 $f17
#define s1 $f22
#define s2 $f9
#define s3 $f10
#define s4 $f11
#define a0 $f20
#define a1 $f21
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VT0 $xr13
#define VT1 $xr14
#define res0 $xr18
#define neg1 $xr19
#define VX0 $xr20
#define VX1 $xr21
#define VM0 $xr22
#define VM1 $xr23

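// CAMIN kernel: the same |re| + |im| construction as CAMAX, but reduced
// with xvfmin; VM0 is seeded from the first element, since a minimum cannot
// start from zero the way the maximum can.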
    PROLOGUE
    MTC s1, $r0
    xvxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
    fld.s a0, X, 0 * SIZE
    fld.s a1, X, 1 * SIZE
    fabs.s a0, a0
    fabs.s a1, a1
    fadd.s s1, a1, a0
    xvreplve0.w VM0, VM0
    li.d TEMP, 1
    li.w I, -1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    xvreplgr2vr.w neg1, I
    xvffint.s.w neg1, neg1
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L23
    .align 3

.L10:
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 8 * SIZE
    addi.d I, I, -1
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
    xvfmul.s x3, neg1, x1
    xvfmul.s x4, neg1, x2
    xvfcmp.clt.s VT0, x1, res0
    xvfcmp.clt.s VT1, x2, res0
    xvbitsel.v x1, x1, x3, VT0
    xvbitsel.v x2, x2, x4, VT1
    addi.d X, X, 16 * SIZE
    xvfadd.s VM1, x1, x2
    xvfmin.s VM0, VM0, VM1
    blt $r0, I, .L10
    .align 3

.L11:
    xvpermi.d VM1, VM0, 0x4e    // fold the high 128-bit half onto the low half so lanes 4..7 join the reduction
    xvfmin.s VM0, VM0, VM1
    xvpickve.w x1, VM0, 0
    xvpickve.w x2, VM0, 1
    xvpickve.w x3, VM0, 2
    xvpickve.w x4, VM0, 3
    xvfmin.s VM1, x1, x2
    xvfmin.s VM0, x3, x4
    xvfmin.s VM0, VM0, VM1
    b .L23
    .align 3

.L20: // INCX != 1
    bge $r0, I, .L23
    .align 3

.L21:
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s s1, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s s2, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    addi.d I, I, -1
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s s3, t1, t3
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s s4, t1, t3
    blt $r0, I, .L21
    .align 3

.L22:
    fmin.s s1, s1, s2
    fmin.s s3, s3, s4
    fmin.s s1, s1, s3
    .align 3

.L23: // N < 8
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD a0, X, 0 * SIZE
    LD a1, X, 1 * SIZE
    addi.d I, I, -1
    FABS a0, a0
    FABS a1, a1
    ADD a0, a0, a1
    add.d X, X, INCX
    fmin.s s1, a0, s1
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.s $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
|
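Together with the LSX variant that follows, this kernel computes the complex "amin" reduction; OpenBLAS ranks a complex entry by |Re| + |Im| rather than by its modulus, and the preset MTC makes the result 0 when N <= 0 or INCX <= 0. A minimal scalar sketch of the same contract in C (the name camin_ref and the inc_x-in-complex-elements convention are illustrative, not part of the kernel):

    #include <math.h>

    /* Scalar model of the kernel above: smallest |re| + |im| over n
       complex floats stored as interleaved (re, im) pairs.  Returns
       0.0f for an empty or invalid input, matching the early exits. */
    static float camin_ref(int n, const float *x, int inc_x)
    {
        if (n <= 0 || inc_x <= 0) return 0.0f;
        float m = fabsf(x[0]) + fabsf(x[1]);
        for (int i = 1; i < n; i++) {
            float v = fabsf(x[2 * i * inc_x]) + fabsf(x[2 * i * inc_x + 1]);
            if (v < m) m = v;
        }
        return m;
    }

The vector loop is this same loop with the compare folded into xvfmin across eight lanes at a time.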
@@ -0,0 +1,211 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r12
#define t1    $f14
#define t2    $f18
#define t3    $f15
#define t4    $f17
#define s1    $f22
#define s2    $f9
#define s3    $f10
#define s4    $f11
#define TEMP  $r16
#define a0    $f20
#define a1    $f21
#define x1    $vr9
#define x2    $vr10
#define x3    $vr11
#define x4    $vr12
#define VT0   $vr13
#define VT1   $vr14
#define res0  $vr18
#define neg1  $vr19
#define VX0   $vr20
#define VX1   $vr21
#define VM0   $vr22
#define VM1   $vr23

    PROLOGUE
    MTC s1, $r0
    vxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
    fld.s a0, X, 0 * SIZE
    fld.s a1, X, 1 * SIZE
    fabs.s a0, a0
    fabs.s a1, a1
    fadd.s s1, a1, a0
    vreplvei.w VM0, VM0, 0        // broadcast |re0| + |im0| as the running minimum
    li.d TEMP, 1
    li.w I, -1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    vreplgr2vr.w neg1, I
    vffint.s.w neg1, neg1
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L23
    .align 3

.L10:
    vld VX0, X, 0 * SIZE
    vld VX1, X, 4 * SIZE
    addi.d I, I, -1
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vfmul.s x3, neg1, x1
    vfmul.s x4, neg1, x2
    vfcmp.clt.s VT0, x1, res0
    vfcmp.clt.s VT1, x2, res0
    vld VX0, X, 8 * SIZE
    vbitsel.v x1, x1, x3, VT0
    vbitsel.v x2, x2, x4, VT1
    vld VX1, X, 12 * SIZE
    vfadd.s VM1, x1, x2
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vfmul.s x3, neg1, x1
    vfmul.s x4, neg1, x2
    vfcmp.clt.s VT0, x1, res0
    vfcmp.clt.s VT1, x2, res0
    addi.d X, X, 16 * SIZE
    vbitsel.v x1, x1, x3, VT0
    vbitsel.v x2, x2, x4, VT1
    vfadd.s x1, x1, x2
    vfmin.s VM1, x1, VM1
    vfmin.s VM0, VM0, VM1
    blt $r0, I, .L10
    .align 3

.L11:                             // horizontal min over the four lanes of VM0
    vreplvei.w x1, VM0, 0
    vreplvei.w x2, VM0, 1
    vreplvei.w x3, VM0, 2
    vreplvei.w x4, VM0, 3
    vfmin.s VM1, x1, x2
    vfmin.s VM0, x3, x4
    vfmin.s VM0, VM0, VM1
    b .L23
    .align 3

.L20: // INCX!=1
    bge $r0, I, .L23
    fmov.s s2, s1                 // seed the four running minima with the first element
    fmov.s s3, s1
    fmov.s s4, s1
    .align 3

.L21:
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s t1, t1, t3
    fmin.s s1, s1, t1             // accumulate into the running minimum s1
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s t1, t1, t3
    fmin.s s2, s2, t1
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    addi.d I, I, -1
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s t1, t1, t3
    fmin.s s3, s3, t1
    fld.s t1, X, 0 * SIZE
    fld.s t2, X, 1 * SIZE
    add.d X, X, INCX
    fld.s t3, X, 0 * SIZE
    fld.s t4, X, 1 * SIZE
    add.d X, X, INCX
    fabs.s t1, t1
    fabs.s t2, t2
    fabs.s t3, t3
    fabs.s t4, t4
    fadd.s t1, t1, t2
    fadd.s t3, t3, t4
    fmin.s t1, t1, t3
    fmin.s s4, s4, t1
    blt $r0, I, .L21
    .align 3

.L22:
    fmin.s s1, s1, s2
    fmin.s s3, s3, s4
    fmin.s s1, s1, s3
    .align 3

.L23: // N < 8 tail
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    fld.s a0, X, 0 * SIZE
    fld.s a1, X, 1 * SIZE
    addi.d I, I, -1
    fabs.s a0, a0
    fabs.s a1, a1
    fadd.s a0, a0, a1
    add.d X, X, INCX
    fmin.s s1, a0, s1
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.s $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
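Both variants compute |x| without branches: multiply the lanes by the neg1 vector, compare the originals against zero, then bit-select the negated lanes. A scalar C model of that three-instruction pattern (purely illustrative):

    /* one lane of the vfmul / vfcmp.clt / vbitsel absolute-value idiom */
    static float abs_branchless(float v)
    {
        float neg = -1.0f * v;     /* vfmul.s x3, neg1, x1       */
        int lt0 = (v < 0.0f);      /* vfcmp.clt.s VT0, x1, res0  */
        return lt0 ? neg : v;      /* vbitsel.v x1, x1, x3, VT0  */
    }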
@@ -0,0 +1,329 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r17
#define TEMP  $r18
#define t1    $r15
#define t2    $r12
#define t3    $r13
#define t4    $r14
#define a1    $f12
#define a2    $f13
#define a3    $f14
#define a4    $f15
#define s1    $f16
#define VX0   $xr12
#define VX1   $xr13
#define VX2   $xr14
#define VX3   $xr15
#define res1  $xr16
#define res2  $xr17
#define res3  $xr18
#define res0  $xr19
#define neg1  $xr20
#define VT0   $xr21
#define VT1   $xr22

    PROLOGUE
    xvxor.v res1, res1, res1
    xvxor.v res2, res2, res2
    xvxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
#ifdef DOUBLE
    li.d t1, -1
    xvreplgr2vr.d neg1, t1
    xvffint.d.l neg1, neg1
#else
    li.w t1, -1
    xvreplgr2vr.w neg1, t1
    xvffint.s.w neg1, neg1
#endif
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L13
    .align 3

.L11:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
    xvld VX2, X, 8 * SIZE
    xvld VX3, X, 12 * SIZE
    xvfmul.d VX0, neg1, VX2
    xvfmul.d VX1, neg1, VX3
    xvfcmp.clt.d VT0, VX2, res0
    xvfcmp.clt.d VT1, VX3, res0
    xvbitsel.v VX2, VX2, VX0, VT0
    xvbitsel.v VX3, VX3, VX1, VT1
    xvfadd.d res2, VX2, VX3
    xvfadd.d res1, res1, res2
#else
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 8 * SIZE
    xvfmul.s VX2, neg1, VX0
    xvfmul.s VX3, neg1, VX1
    xvfcmp.clt.s VT0, VX0, res0
    xvfcmp.clt.s VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.s res2, VX0, VX1
    xvfadd.s res1, res2, res1
#endif
    addi.d X, X, 16 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L11
    .align 3

.L12:
#ifdef DOUBLE
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
#else
    // fold the remaining seven lanes of res1 into lane 0
    xvpickve.w VX1, res1, 1
    xvpickve.w VX2, res1, 2
    xvpickve.w VX3, res1, 3
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
    xvpickve.w VX0, res1, 4
    xvpickve.w VX1, res1, 5
    xvpickve.w VX2, res1, 6
    xvpickve.w VX3, res1, 7
    xvfadd.s res1, VX0, res1
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
#endif
    .align 3

.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    FABS a1, a1
    FABS a2, a2
    addi.d I, I, -1
    ADD a1, a1, a2
    ADD s1, a1, s1
    addi.d X, X, 2 * SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

.L20:
    bge $r0, I, .L23
    .align 3

.L21:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfmul.d VX2, neg1, VX0
    xvfmul.d VX3, neg1, VX1
    xvfcmp.clt.d VT0, VX0, res0
    xvfcmp.clt.d VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX1, t1, 0
    xvinsgr2vr.w VX1, t2, 1
    xvinsgr2vr.w VX1, t3, 2
    xvinsgr2vr.w VX1, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX1, t1, 4
    xvinsgr2vr.w VX1, t2, 5
    xvinsgr2vr.w VX1, t3, 6
    xvinsgr2vr.w VX1, t4, 7
    xvfmul.s VX2, neg1, VX0
    xvfmul.s VX3, neg1, VX1
    xvfcmp.clt.s VT0, VX0, res0
    xvfcmp.clt.s VT1, VX1, res0
    xvbitsel.v VX0, VX0, VX2, VT0
    xvbitsel.v VX1, VX1, VX3, VT1
    xvfadd.s res2, VX0, VX1
    xvfadd.s res1, res2, res1
#endif
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

.L22:
#ifdef DOUBLE
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
#else
    // fold the remaining seven lanes of res1 into lane 0
    xvpickve.w VX1, res1, 1
    xvpickve.w VX2, res1, 2
    xvpickve.w VX3, res1, 3
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
    xvpickve.w VX0, res1, 4
    xvpickve.w VX1, res1, 5
    xvpickve.w VX2, res1, 6
    xvpickve.w VX3, res1, 7
    xvfadd.s res1, VX0, res1
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
#endif
    .align 3

.L23:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    FABS a1, a1
    FABS a2, a2
    addi.d I, I, -1
    ADD a1, a1, a2
    ADD s1, a1, s1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    MOV $f0, $f16
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
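This LASX kernel is the complex asum: it accumulates |Re| + |Im| over the whole vector and exits early to zero for non-positive N or INCX. A scalar C sketch under the same conventions as before (casum_ref is a hypothetical name; inc_x counts complex elements):

    #include <math.h>

    static float casum_ref(int n, const float *x, int inc_x)
    {
        if (n <= 0 || inc_x <= 0) return 0.0f;
        float s = 0.0f;
        for (int i = 0; i < n; i++)
            s += fabsf(x[2 * i * inc_x]) + fabsf(x[2 * i * inc_x + 1]);
        return s;
    }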
@@ -0,0 +1,358 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N     $r4
#define X     $r5
#define INCX  $r6
#define I     $r17
#define TEMP  $r18
#define t1    $r15
#define t2    $r12
#define t3    $r13
#define t4    $r14
#define a1    $f12
#define a2    $f13
#define a3    $f14
#define a4    $f15
#define s1    $f16
#define VX0   $vr12
#define VX1   $vr13
#define VX2   $vr14
#define VX3   $vr15
#define res1  $vr16
#define res2  $vr17
#define res3  $vr18
#define res0  $vr19
#define neg1  $vr20
#define VT0   $vr21
#define VT1   $vr22

    PROLOGUE
    vxor.v res1, res1, res1
    vxor.v res2, res2, res2
    vxor.v res0, res0, res0
    bge $r0, N, .L999
    bge $r0, INCX, .L999
#ifdef DOUBLE
    li.d t1, -1
    vreplgr2vr.d neg1, t1
    vffint.d.l neg1, neg1
#else
    li.w t1, -1
    vreplgr2vr.w neg1, t1
    vffint.s.w neg1, neg1
#endif
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L13
    .align 3

.L11:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    vld VX2, X, 4 * SIZE
    vld VX3, X, 6 * SIZE
    vfmul.d VX0, neg1, VX2
    vfmul.d VX1, neg1, VX3
    vfcmp.clt.d VT0, VX2, res0
    vfcmp.clt.d VT1, VX3, res0
    vbitsel.v VX2, VX2, VX0, VT0
    vbitsel.v VX3, VX3, VX1, VT1
    vfadd.d res2, VX2, VX3
    vfadd.d res1, res1, res2
    vld VX0, X, 8 * SIZE
    vld VX1, X, 10 * SIZE
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    vld VX2, X, 12 * SIZE
    vld VX3, X, 14 * SIZE
    vfmul.d VX0, neg1, VX2
    vfmul.d VX1, neg1, VX3
    vfcmp.clt.d VT0, VX2, res0
    vfcmp.clt.d VT1, VX3, res0
    vbitsel.v VX2, VX2, VX0, VT0
    vbitsel.v VX3, VX3, VX1, VT1
    vfadd.d res2, VX2, VX3
    vfadd.d res1, res1, res2
    addi.d I, I, -1
#else
    vld VX0, X, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vfmul.s VX2, neg1, VX0
    vfmul.s VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.s res2, VX0, VX1
    vld VX0, X, 8 * SIZE
    vld VX1, X, 12 * SIZE
    addi.d I, I, -1
    vfmul.s VX2, neg1, VX0
    vfmul.s VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.s res3, VX1, VX0
    vfadd.s res2, res3, res2
    vfadd.s res1, res1, res2
#endif
    addi.d X, X, 16 * SIZE
    blt $r0, I, .L11
    .align 3

.L12:
#ifdef DOUBLE
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
#else
    vreplvei.w VX1, res1, 1
    vreplvei.w VX2, res1, 2
    vreplvei.w VX3, res1, 3
    vfadd.s res1, VX1, res1
    vfadd.s res1, VX2, res1
    vfadd.s res1, VX3, res1
#endif
    .align 3

.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    FABS a1, a1
    FABS a2, a2
    addi.d I, I, -1
    ADD a1, a1, a2
    ADD s1, a1, s1
    addi.d X, X, 2 * SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

.L20:
    bge $r0, I, .L23
    .align 3

.L21:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    vinsgr2vr.d VX1, t1, 0
    vinsgr2vr.d VX1, t2, 1
    add.d X, X, INCX
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t3, 0
    vinsgr2vr.d VX0, t4, 1
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    vinsgr2vr.d VX1, t1, 0
    vinsgr2vr.d VX1, t2, 1
    add.d X, X, INCX
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t3, 0
    vinsgr2vr.d VX0, t4, 1
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vfmul.d VX2, neg1, VX0
    vfmul.d VX3, neg1, VX1
    vfcmp.clt.d VT0, VX0, res0
    vfcmp.clt.d VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vfmul.s VX2, neg1, VX0
    vfmul.s VX3, neg1, VX1
    vfcmp.clt.s VT0, VX0, res0
    vfcmp.clt.s VT1, VX1, res0
    vbitsel.v VX0, VX0, VX2, VT0
    vbitsel.v VX1, VX1, VX3, VT1
    vfadd.s res2, VX0, VX1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX2, t1, 0
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    vfmul.s VX0, neg1, VX2
    vfmul.s VX1, neg1, VX3
    vfcmp.clt.s VT0, VX2, res0
    vfcmp.clt.s VT1, VX3, res0
    vbitsel.v VX2, VX2, VX0, VT0
    vbitsel.v VX3, VX3, VX1, VT1
    vfadd.s res3, VX2, VX3
    vfadd.s res2, res3, res2
    vfadd.s res1, res1, res2
#endif
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

.L22:
#ifdef DOUBLE
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
#else
    vreplvei.w VX1, res1, 1
    vreplvei.w VX2, res1, 2
    vreplvei.w VX3, res1, 3
    vfadd.s res1, VX1, res1
    vfadd.s res1, VX2, res1
    vfadd.s res1, VX3, res1
#endif
    .align 3

.L23:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    FABS a1, a1
    FABS a2, a2
    addi.d I, I, -1
    ADD a1, a1, a2
    ADD s1, a1, s1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    MOV $f0, $f16
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
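The .L12/.L22 blocks above fold the per-lane partial sums into lane 0: each vreplvei (or xvpickve in the LASX file) materializes one lane, and a vector add accumulates it. The same fold written out in C for a four-lane register (illustrative only):

    /* lane-0 accumulation, as in the vreplvei.w + vfadd.s chain */
    static float hsum4(const float lane[4])
    {
        float acc = lane[0];
        acc += lane[1];
        acc += lane[2];
        acc += lane[3];
        return acc;
    }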
@@ -0,0 +1,707 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER

#include "common.h"

#define N      $r4
#define XX     $r5
#define YY     $r6
#define ALPHAR $f0
#define ALPHAI $f1
#define X      $r7
#define INCX   $r8
#define Y      $r9
#define INCY   $r10

#define I      $r12
#define TEMP   $r13
#define t1     $r14
#define t2     $r16
#define t3     $r15
#define t4     $r17
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define s1     $f16
#define s2     $f17
#define s3     $f18
#define s4     $f19
#define VX0    $xr8
#define VX1    $xr20
#define VX2    $xr21
#define VX3    $xr22
#define VXAR   $xr23
#define VXAI   $xr19
#define x1     $xr18
#define x2     $xr17
#define x3     $xr16
#define x4     $xr15

    PROLOGUE

    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    CMPEQ $fcc0, ALPHAR, a1
    CMPEQ $fcc1, ALPHAI, a1
    bceqz $fcc0, .L10
    bcnez $fcc1, .L999
.L10:
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    slli.d INCY, INCY, ZBASE_SHIFT
    MTG t1, ALPHAR
    MTG t2, ALPHAI
#ifdef DOUBLE
    xvreplgr2vr.d VXAR, t1
    xvreplgr2vr.d VXAI, t2
    srai.d I, N, 2
#else
    xvreplgr2vr.w VXAR, t1
    xvreplgr2vr.w VXAI, t2
    srai.d I, N, 3
#endif
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

.L11:
    bge $r0, I, .L997
    .align 3

.L111:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvld VX3, Y, 4 * SIZE
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
    xvpickev.d x3, VX3, VX2
    xvpickod.d x4, VX3, VX2
#else
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 8 * SIZE
    xvld VX3, Y, 8 * SIZE
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
    xvpickev.w x3, VX3, VX2
    xvpickod.w x4, VX3, VX2
#endif
#if !defined(CONJ)
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmsub.d VX1, VXAR, x1, VX0
    xvfmadd.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfadd.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmsub.s VX1, VXAR, x1, VX0
    xvfmadd.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfadd.s x4, x4, VX3
#endif
#else
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmadd.d VX1, VXAR, x1, VX0
    xvfmsub.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfsub.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmadd.s VX1, VXAR, x1, VX0
    xvfmsub.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfsub.s x4, x4, VX3
#endif
#endif
#ifdef DOUBLE
    xvilvl.d VX2, x4, x3
    xvilvh.d VX3, x4, x3
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
#else
    xvilvl.w VX2, x4, x3
    xvilvh.w VX3, x4, x3
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 8 * SIZE
    addi.d X, X, 16 * SIZE
    addi.d Y, Y, 16 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L111
    b .L997
    .align 3

.L12: // INCX==1 and INCY!=1
    bge $r0, I, .L997
    move YY, Y
    .align 3

.L121:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.d x3, t1, 0
    xvinsgr2vr.d x4, t2, 0
    xvinsgr2vr.d x3, t3, 2
    xvinsgr2vr.d x4, t4, 2
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    xvinsgr2vr.d x3, t1, 1
    xvinsgr2vr.d x4, t2, 1
    xvinsgr2vr.d x3, t3, 3
    xvinsgr2vr.d x4, t4, 3
    add.d Y, Y, INCY
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
#else
    xvld VX0, X, 0 * SIZE
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w x3, t1, 0
    xvinsgr2vr.w x4, t2, 0
    xvinsgr2vr.w x3, t3, 1
    xvinsgr2vr.w x4, t4, 1
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w x3, t1, 4
    xvinsgr2vr.w x4, t2, 4
    xvinsgr2vr.w x3, t3, 5
    xvinsgr2vr.w x4, t4, 5
    xvld VX1, X, 8 * SIZE
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w x3, t1, 2
    xvinsgr2vr.w x4, t2, 2
    xvinsgr2vr.w x3, t3, 3
    xvinsgr2vr.w x4, t4, 3
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    xvinsgr2vr.w x3, t1, 6
    xvinsgr2vr.w x4, t2, 6
    xvinsgr2vr.w x3, t3, 7
    xvinsgr2vr.w x4, t4, 7
    add.d Y, Y, INCY
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
#endif
#if !defined(CONJ)
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmsub.d VX1, VXAR, x1, VX0
    xvfmadd.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfadd.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmsub.s VX1, VXAR, x1, VX0
    xvfmadd.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfadd.s x4, x4, VX3
#endif
#else
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmadd.d VX1, VXAR, x1, VX0
    xvfmsub.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfsub.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmadd.s VX1, VXAR, x1, VX0
    xvfmsub.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfsub.s x4, x4, VX3
#endif
#endif
#ifdef DOUBLE
    xvstelm.d x3, YY, 0 * SIZE, 0
    xvstelm.d x4, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    xvstelm.d x3, YY, 0 * SIZE, 2
    xvstelm.d x4, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    xvstelm.d x3, YY, 0 * SIZE, 1
    xvstelm.d x4, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    xvstelm.d x3, YY, 0 * SIZE, 3
    xvstelm.d x4, YY, 1 * SIZE, 3
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
#else
    addi.d I, I, -1
    xvstelm.w x3, YY, 0 * SIZE, 0
    xvstelm.w x4, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 1
    xvstelm.w x4, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 4
    xvstelm.w x4, YY, 1 * SIZE, 4
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 5
    xvstelm.w x4, YY, 1 * SIZE, 5
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 2
    xvstelm.w x4, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 3
    xvstelm.w x4, YY, 1 * SIZE, 3
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 6
    xvstelm.w x4, YY, 1 * SIZE, 6
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 7
    xvstelm.w x4, YY, 1 * SIZE, 7
    add.d YY, YY, INCY
    addi.d X, X, 16 * SIZE
#endif
    blt $r0, I, .L121
    b .L997
    .align 3

.L21: // INCX!=1 and INCY==1
    bge $r0, I, .L997
    .align 3

.L211:
#ifdef DOUBLE
    xvld VX2, Y, 0 * SIZE
    xvld VX3, Y, 4 * SIZE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 2
    xvinsgr2vr.d x2, t4, 2
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    xvinsgr2vr.d x1, t1, 1
    xvinsgr2vr.d x2, t2, 1
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    add.d X, X, INCX
    xvpickev.d x3, VX3, VX2
    xvpickod.d x4, VX3, VX2
#else
    xvld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    xvld VX3, Y, 8 * SIZE
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    xvpickev.w x3, VX3, VX2
    xvpickod.w x4, VX3, VX2
#endif
#if !defined(CONJ)
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmsub.d VX1, VXAR, x1, VX0
    xvfmadd.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfadd.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmsub.s VX1, VXAR, x1, VX0
    xvfmadd.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfadd.s x4, x4, VX3
#endif
#else
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmadd.d VX1, VXAR, x1, VX0
    xvfmsub.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfsub.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmadd.s VX1, VXAR, x1, VX0
    xvfmsub.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfsub.s x4, x4, VX3
#endif
#endif
#ifdef DOUBLE
    xvilvl.d VX2, x4, x3
    xvilvh.d VX3, x4, x3
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
    addi.d Y, Y, 8 * SIZE
#else
    xvilvl.w VX2, x4, x3
    xvilvh.w VX3, x4, x3
    addi.d I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 8 * SIZE
    addi.d Y, Y, 16 * SIZE
#endif
    blt $r0, I, .L211
    b .L997
    .align 3

.L22:
    bge $r0, I, .L997
    move YY, Y
    .align 3

.L222:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 1
    xvinsgr2vr.d x2, t4, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 2
    xvinsgr2vr.d x2, t2, 2
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.d x3, t1, 0
    xvinsgr2vr.d x4, t2, 0
    xvinsgr2vr.d x3, t3, 1
    xvinsgr2vr.d x4, t4, 1
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.d x3, t1, 2
    xvinsgr2vr.d x4, t2, 2
    xvinsgr2vr.d x3, t3, 3
    xvinsgr2vr.d x4, t4, 3
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w x3, t1, 0
    xvinsgr2vr.w x4, t2, 0
    xvinsgr2vr.w x3, t3, 1
    xvinsgr2vr.w x4, t4, 1
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w x3, t1, 2
    xvinsgr2vr.w x4, t2, 2
    xvinsgr2vr.w x3, t3, 3
    xvinsgr2vr.w x4, t4, 3

    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w x3, t1, 4
    xvinsgr2vr.w x4, t2, 4
    xvinsgr2vr.w x3, t3, 5
    xvinsgr2vr.w x4, t4, 5
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    xvinsgr2vr.w x3, t1, 6
    xvinsgr2vr.w x4, t2, 6
    xvinsgr2vr.w x3, t3, 7
    xvinsgr2vr.w x4, t4, 7
#endif
#if !defined(CONJ)
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmsub.d VX1, VXAR, x1, VX0
    xvfmadd.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfadd.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmsub.s VX1, VXAR, x1, VX0
    xvfmadd.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfadd.s x4, x4, VX3
#endif
#else
#ifdef DOUBLE
    xvfmul.d VX0, VXAI, x2
    xvfmul.d VX2, VXAI, x1
    xvfmadd.d VX1, VXAR, x1, VX0
    xvfmsub.d VX3, x2, VXAR, VX2
    xvfadd.d x3, x3, VX1
    xvfsub.d x4, x4, VX3
#else
    xvfmul.s VX0, VXAI, x2
    xvfmul.s VX2, VXAI, x1
    xvfmadd.s VX1, VXAR, x1, VX0
    xvfmsub.s VX3, x2, VXAR, VX2
    xvfadd.s x3, x3, VX1
    xvfsub.s x4, x4, VX3
#endif
#endif
    addi.d I, I, -1
#ifdef DOUBLE
    xvstelm.d x3, YY, 0 * SIZE, 0
    xvstelm.d x4, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    xvstelm.d x3, YY, 0 * SIZE, 1
    xvstelm.d x4, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    xvstelm.d x3, YY, 0 * SIZE, 2
    xvstelm.d x4, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    xvstelm.d x3, YY, 0 * SIZE, 3
    xvstelm.d x4, YY, 1 * SIZE, 3
#else
    xvstelm.w x3, YY, 0 * SIZE, 0
    xvstelm.w x4, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 1
    xvstelm.w x4, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 2
    xvstelm.w x4, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 3
    xvstelm.w x4, YY, 1 * SIZE, 3
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 4
    xvstelm.w x4, YY, 1 * SIZE, 4
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 5
    xvstelm.w x4, YY, 1 * SIZE, 5
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 6
    xvstelm.w x4, YY, 1 * SIZE, 6
    add.d YY, YY, INCY
    xvstelm.w x3, YY, 0 * SIZE, 7
    xvstelm.w x4, YY, 1 * SIZE, 7
#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    .align 3

.L997:
#ifdef DOUBLE
    andi I, N, 3
#else
    andi I, N, 7
#endif
    bge $r0, I, .L999
    .align 3

.L998:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    addi.d I, I, -1
#if !defined(CONJ)
    MUL s1, ALPHAI, a2
    MUL s2, ALPHAI, a1
    MSUB s3, ALPHAR, a1, s1
    MADD s4, a2, ALPHAR, s2
    ADD s3, s3, a3
    ADD s4, s4, a4
#else
    MUL s1, ALPHAI, a2
    MUL s2, ALPHAI, a1
    MADD s3, ALPHAR, a1, s1
    MSUB s4, a2, ALPHAR, s2
    ADD s3, s3, a3
    SUB s4, a4, s4
#endif
    ST s3, Y, 0 * SIZE
    ST s4, Y, 1 * SIZE
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L998
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
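This kernel and the LSX variant below implement complex axpy, y[i] += alpha * x[i]; building with CONJ applies the conjugate of x[i] instead, which is what the fmadd/fmsub swaps and the final SUB in the scalar tail encode. A scalar C sketch of both variants (caxpy_ref is a hypothetical name; positive strides counted in complex elements are assumed for clarity):

    static void caxpy_ref(int n, float ar, float ai,
                          const float *x, int inc_x,
                          float *y, int inc_y, int conj)
    {
        if (n <= 0 || (ar == 0.0f && ai == 0.0f)) return;  /* matches the early exits */
        for (int i = 0; i < n; i++) {
            const float *px = x + 2 * i * inc_x;
            float *py = y + 2 * i * inc_y;
            float xr = px[0];
            float xi = conj ? -px[1] : px[1];  /* CONJ flips the imaginary part of x */
            py[0] += ar * xr - ai * xi;        /* the MSUB/MADD pair in .L998        */
            py[1] += ar * xi + ai * xr;
        }
    }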
@@ -0,0 +1,679 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N      $r4
#define XX     $r5
#define YY     $r6
#define ALPHAR $f0
#define ALPHAI $f1
#define X      $r7
#define INCX   $r8
#define Y      $r9
#define INCY   $r10

#define I      $r12
#define TEMP   $r13
#define t1     $r14
#define t2     $r16
#define t3     $r15
#define t4     $r17
#define a1     $f12
#define a2     $f13
#define a3     $f14
#define a4     $f15
#define s1     $f16
#define s2     $f17
#define s3     $f18
#define s4     $f19
#define VX0    $vr8
#define VX1    $vr20
#define VX2    $vr21
#define VX3    $vr22
#define VXAR   $vr23
#define VXAI   $vr19
#define x1     $vr18
#define x2     $vr17
#define x3     $vr16
#define x4     $vr15

    PROLOGUE

    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    CMPEQ $fcc0, ALPHAR, a1
    CMPEQ $fcc1, ALPHAI, a1
    bceqz $fcc0, .L10
    bcnez $fcc1, .L999
.L10:
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    slli.d INCY, INCY, ZBASE_SHIFT
    MTG t1, ALPHAR
    MTG t2, ALPHAI
#ifdef DOUBLE
    vreplgr2vr.d VXAR, t1
    vreplgr2vr.d VXAI, t2
#else
    vreplgr2vr.w VXAR, t1
    vreplgr2vr.w VXAI, t2
#endif
    srai.d I, N, 2
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

.L11:
    bge $r0, I, .L997
    .align 3

.L111:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vld VX3, Y, 2 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vpickev.d x3, VX3, VX2
    vpickod.d x4, VX3, VX2
#else
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vld VX3, Y, 4 * SIZE
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vpickev.w x3, VX3, VX2
    vpickod.w x4, VX3, VX2
#endif
#if !defined(CONJ)
#ifdef DOUBLE
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmsub.d VX1, VXAR, x1, VX0
    vfmadd.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfadd.d x4, x4, VX3
#else
    vfmul.s VX0, VXAI, x2
    vfmul.s VX2, VXAI, x1
    vfmsub.s VX1, VXAR, x1, VX0
    vfmadd.s VX3, x2, VXAR, VX2
    vfadd.s x3, x3, VX1
    vfadd.s x4, x4, VX3
#endif
#else
#ifdef DOUBLE
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmadd.d VX1, VXAR, x1, VX0
    vfmsub.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfsub.d x4, x4, VX3
#else
    vfmul.s VX0, VXAI, x2
    vfmul.s VX2, VXAI, x1
    vfmadd.s VX1, VXAR, x1, VX0
    vfmsub.s VX3, x2, VXAR, VX2
    vfadd.s x3, x3, VX1
    vfsub.s x4, x4, VX3
#endif
#endif
#ifdef DOUBLE
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 2 * SIZE
    vld VX0, X, 4 * SIZE
    vld VX2, Y, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vld VX3, Y, 6 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vpickev.d x3, VX3, VX2
    vpickod.d x4, VX3, VX2
#if !defined(CONJ)
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmsub.d VX1, VXAR, x1, VX0
    vfmadd.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfadd.d x4, x4, VX3
#else
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmadd.d VX1, VXAR, x1, VX0
    vfmsub.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfsub.d x4, x4, VX3
#endif
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
#else
    vilvl.w VX2, x4, x3
    vilvh.w VX3, x4, x3
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 4 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L111
    b .L997
    .align 3

.L12: // INCX==1 and INCY!=1
    bge $r0, I, .L997
    move YY, Y
    .align 3

.L121:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
#else
    vld VX0, X, 0 * SIZE
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    vinsgr2vr.w x3, t1, 0
    vinsgr2vr.w x4, t2, 0
    vinsgr2vr.w x3, t3, 1
    vinsgr2vr.w x4, t4, 1
    vld VX1, X, 4 * SIZE
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    vinsgr2vr.w x3, t1, 2
    vinsgr2vr.w x4, t2, 2
    vinsgr2vr.w x3, t3, 3
    vinsgr2vr.w x4, t4, 3
    add.d Y, Y, INCY
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
#endif
#if !defined(CONJ)
#ifdef DOUBLE
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmsub.d VX1, VXAR, x1, VX0
    vfmadd.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfadd.d x4, x4, VX3
#else
    vfmul.s VX0, VXAI, x2
    vfmul.s VX2, VXAI, x1
    vfmsub.s VX1, VXAR, x1, VX0
    vfmadd.s VX3, x2, VXAR, VX2
    vfadd.s x3, x3, VX1
    vfadd.s x4, x4, VX3
#endif
#else
#ifdef DOUBLE
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmadd.d VX1, VXAR, x1, VX0
    vfmsub.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfsub.d x4, x4, VX3
#else
    vfmul.s VX0, VXAI, x2
    vfmul.s VX2, VXAI, x1
    vfmadd.s VX1, VXAR, x1, VX0
    vfmsub.s VX3, x2, VXAR, VX2
    vfadd.s x3, x3, VX1
    vfsub.s x4, x4, VX3
#endif
#endif
#ifdef DOUBLE
    vstelm.d x3, YY, 0 * SIZE, 0
    vstelm.d x4, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d x3, YY, 0 * SIZE, 1
    vstelm.d x4, YY, 1 * SIZE, 1
    add.d YY, YY, INCY

    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
#if !defined(CONJ)
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmsub.d VX1, VXAR, x1, VX0
    vfmadd.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfadd.d x4, x4, VX3
#else
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmadd.d VX1, VXAR, x1, VX0
    vfmsub.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfsub.d x4, x4, VX3
#endif
    addi.d I, I, -1
    vstelm.d x3, YY, 0 * SIZE, 0
    vstelm.d x4, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d x3, YY, 0 * SIZE, 1
    vstelm.d x4, YY, 1 * SIZE, 1
#else
    addi.d I, I, -1
    vstelm.w x3, YY, 0 * SIZE, 0
    vstelm.w x4, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.w x3, YY, 0 * SIZE, 1
    vstelm.w x4, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    vstelm.w x3, YY, 0 * SIZE, 2
    vstelm.w x4, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    vstelm.w x3, YY, 0 * SIZE, 3
    vstelm.w x4, YY, 1 * SIZE, 3
#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L121
    b .L997
    .align 3

.L21: // INCX!=1 and INCY==1
    bge $r0, I, .L997
    .align 3

.L211:
#ifdef DOUBLE
    vld VX2, Y, 0 * SIZE
    vld VX3, Y, 2 * SIZE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    vpickev.d x3, VX3, VX2
    vpickod.d x4, VX3, VX2
#else
    vld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w x1, t1, 0
    vinsgr2vr.w x2, t2, 0
    vinsgr2vr.w x1, t3, 1
    vinsgr2vr.w x2, t4, 1
    vld VX3, Y, 4 * SIZE
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w x1, t1, 2
    vinsgr2vr.w x2, t2, 2
    vinsgr2vr.w x1, t3, 3
    vinsgr2vr.w x2, t4, 3
    add.d X, X, INCX
    vpickev.w x3, VX3, VX2
    vpickod.w x4, VX3, VX2
#endif
#if !defined(CONJ)
#ifdef DOUBLE
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmsub.d VX1, VXAR, x1, VX0
    vfmadd.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfadd.d x4, x4, VX3
#else
    vfmul.s VX0, VXAI, x2
    vfmul.s VX2, VXAI, x1
    vfmsub.s VX1, VXAR, x1, VX0
    vfmadd.s VX3, x2, VXAR, VX2
    vfadd.s x3, x3, VX1
    vfadd.s x4, x4, VX3
#endif
#else
#ifdef DOUBLE
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmadd.d VX1, VXAR, x1, VX0
    vfmsub.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfsub.d x4, x4, VX3
#else
    vfmul.s VX0, VXAI, x2
    vfmul.s VX2, VXAI, x1
    vfmadd.s VX1, VXAR, x1, VX0
    vfmsub.s VX3, x2, VXAR, VX2
    vfadd.s x3, x3, VX1
    vfsub.s x4, x4, VX3
#endif
#endif
#ifdef DOUBLE
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 2 * SIZE

    vld VX2, Y, 4 * SIZE
    vld VX3, Y, 6 * SIZE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    vpickev.d x3, VX3, VX2
    vpickod.d x4, VX3, VX2
#if !defined(CONJ)
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmsub.d VX1, VXAR, x1, VX0
    vfmadd.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfadd.d x4, x4, VX3
#else
    vfmul.d VX0, VXAI, x2
    vfmul.d VX2, VXAI, x1
    vfmadd.d VX1, VXAR, x1, VX0
    vfmsub.d VX3, x2, VXAR, VX2
    vfadd.d x3, x3, VX1
    vfsub.d x4, x4, VX3
#endif
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    addi.d I, I, -1
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
#else
    vilvl.w VX2, x4, x3
    vilvh.w VX3, x4, x3
    addi.d I, I, -1
    vst VX2, Y, 0 * SIZE
    vst VX3, Y, 4 * SIZE
#endif
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L211
    b .L997
    .align 3

.L22:
    bge $r0, I, .L997
    move YY, Y
|
||||
.align 3
|
||||
|
||||
.L222:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 0
|
||||
vinsgr2vr.w x2, t2, 0
|
||||
vinsgr2vr.w x1, t3, 1
|
||||
vinsgr2vr.w x2, t4, 1
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 0
|
||||
vinsgr2vr.w x4, t2, 0
|
||||
vinsgr2vr.w x3, t3, 1
|
||||
vinsgr2vr.w x4, t4, 1
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w x1, t1, 2
|
||||
vinsgr2vr.w x2, t2, 2
|
||||
vinsgr2vr.w x1, t3, 3
|
||||
vinsgr2vr.w x2, t4, 3
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
vinsgr2vr.w x3, t1, 2
|
||||
vinsgr2vr.w x4, t2, 2
|
||||
vinsgr2vr.w x3, t3, 3
|
||||
vinsgr2vr.w x4, t4, 3
|
||||
#endif
|
||||
add.d Y, Y, INCY
|
||||
#if !defined(CONJ)
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmsub.s VX1, VXAR, x1, VX0
|
||||
vfmadd.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfadd.s x4, x4, VX3
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.s VX0, VXAI, x2
|
||||
vfmul.s VX2, VXAI, x1
|
||||
vfmadd.s VX1, VXAR, x1, VX0
|
||||
vfmsub.s VX3, x2, VXAR, VX2
|
||||
vfadd.s x3, x3, VX1
|
||||
vfsub.s x4, x4, VX3
|
||||
#endif
|
||||
#endif
|
||||
#ifdef DOUBLE
|
||||
vstelm.d x3, YY, 0 * SIZE, 0
|
||||
vstelm.d x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d x3, YY, 0 * SIZE, 1
|
||||
vstelm.d x4, YY, 1 * SIZE, 1
|
||||
add.d YY, YY, INCY
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d x1, t1, 0
|
||||
vinsgr2vr.d x2, t2, 0
|
||||
vinsgr2vr.d x1, t3, 1
|
||||
vinsgr2vr.d x2, t4, 1
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
add.d Y, Y, INCY
|
||||
#if !defined(CONJ)
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmsub.d VX1, VXAR, x1, VX0
|
||||
vfmadd.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfadd.d x4, x4, VX3
|
||||
#else
|
||||
vfmul.d VX0, VXAI, x2
|
||||
vfmul.d VX2, VXAI, x1
|
||||
vfmadd.d VX1, VXAR, x1, VX0
|
||||
vfmsub.d VX3, x2, VXAR, VX2
|
||||
vfadd.d x3, x3, VX1
|
||||
vfsub.d x4, x4, VX3
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
vstelm.d x3, YY, 0 * SIZE, 0
|
||||
vstelm.d x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d x3, YY, 0 * SIZE, 1
|
||||
vstelm.d x4, YY, 1 * SIZE, 1
|
||||
#else
|
||||
addi.d I, I, -1
|
||||
vstelm.w x3, YY, 0 * SIZE, 0
|
||||
vstelm.w x4, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 1
|
||||
vstelm.w x4, YY, 1 * SIZE, 1
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 2
|
||||
vstelm.w x4, YY, 1 * SIZE, 2
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w x3, YY, 0 * SIZE, 3
|
||||
vstelm.w x4, YY, 1 * SIZE, 3
|
||||
#endif
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
.L997:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L998:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, Y, 0 * SIZE
|
||||
LD a4, Y, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
#if !defined(CONJ)
|
||||
MUL s1, ALPHAI, a2
|
||||
MUL s2, ALPHAI, a1
|
||||
MSUB s3, ALPHAR, a1, s1
|
||||
MADD s4, a2, ALPHAR, s2
|
||||
ADD s3, s3, a3
|
||||
ADD s4, s4, a4
|
||||
#else
|
||||
MUL s1, ALPHAI, a2
|
||||
MUL s2, ALPHAI, a1
|
||||
MADD s3, ALPHAR, a1, s1
|
||||
MSUB s4, a2, ALPHAR, s2
|
||||
ADD s3, s3, a3
|
||||
SUB s4, a4, s4
|
||||
#endif
|
||||
ST s3, Y, 0 * SIZE
|
||||
ST s4, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r12
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
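
For reference, the kernel above vectorizes the complex AXPY update y := y + alpha*x, with the CONJ build computing y := y + alpha*conj(x) instead. A minimal scalar C sketch of the same arithmetic, assuming interleaved (re, im) element storage and strides counted in complex elements; the FLOAT typedef and the name zaxpy_ref are illustrative, not part of the OpenBLAS sources:

typedef double FLOAT;  /* stand-in for float or double, mirroring the DOUBLE branches */

/* Scalar model of the per-element update performed by the kernel above. */
static void zaxpy_ref(long n, FLOAT ar, FLOAT ai,
                      const FLOAT *x, long incx, FLOAT *y, long incy)
{
    for (long i = 0; i < n; i++) {
        FLOAT xr = x[0], xi = x[1];
#ifndef CONJ
        y[0] += ar * xr - ai * xi;   /* Re(alpha * x) */
        y[1] += ar * xi + ai * xr;   /* Im(alpha * x) */
#else
        y[0] += ar * xr + ai * xi;   /* Re(alpha * conj(x)) */
        y[1] += ai * xr - ar * xi;   /* Im(alpha * conj(x)) */
#endif
        x += 2 * incx;
        y += 2 * incy;
    }
}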
@@ -0,0 +1,386 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15

PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
slli.d INCY, INCY, ZBASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1

.L11:
bge $r0, I, .L112
.align 3

.L111:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
xvld VX2, X, 8 * SIZE
xvld VX3, X, 12 * SIZE
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 4 * SIZE
xvst VX2, Y, 8 * SIZE
xvst VX3, Y, 12 * SIZE
#else
xvld VX1, X, 8 * SIZE
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 8 * SIZE
#endif
addi.d I, I, -1
addi.d X, X, 16 * SIZE
addi.d Y, Y, 16 * SIZE
blt $r0, I, .L111
.align 3

.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L113:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
addi.d X, X, 2 * SIZE
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
addi.d Y, Y, 2 * SIZE
blt $r0, I, .L113
b .L999
.align 3

.L12:
bge $r0, I, .L122
.align 3

.L121:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
xvld VX2, X, 8 * SIZE
xvld VX3, X, 12 * SIZE
xvstelm.d VX0, Y, 0 * SIZE, 0
xvstelm.d VX0, Y, 1 * SIZE, 1
add.d Y, Y, INCY
xvstelm.d VX0, Y, 0 * SIZE, 2
xvstelm.d VX0, Y, 1 * SIZE, 3
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0 * SIZE, 0
xvstelm.d VX1, Y, 1 * SIZE, 1
add.d Y, Y, INCY
xvstelm.d VX1, Y, 0 * SIZE, 2
xvstelm.d VX1, Y, 1 * SIZE, 3
add.d Y, Y, INCY
xvstelm.d VX2, Y, 0 * SIZE, 0
xvstelm.d VX2, Y, 1 * SIZE, 1
add.d Y, Y, INCY
xvstelm.d VX2, Y, 0 * SIZE, 2
xvstelm.d VX2, Y, 1 * SIZE, 3
add.d Y, Y, INCY
xvstelm.d VX3, Y, 0 * SIZE, 0
xvstelm.d VX3, Y, 1 * SIZE, 1
add.d Y, Y, INCY
xvstelm.d VX3, Y, 0 * SIZE, 2
xvstelm.d VX3, Y, 1 * SIZE, 3
#else
xvld VX1, X, 8 * SIZE
xvstelm.w VX0, Y, 0 * SIZE, 0
xvstelm.w VX0, Y, 1 * SIZE, 1
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0 * SIZE, 2
xvstelm.w VX0, Y, 1 * SIZE, 3
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0 * SIZE, 4
xvstelm.w VX0, Y, 1 * SIZE, 5
add.d Y, Y, INCY
xvstelm.w VX0, Y, 0 * SIZE, 6
xvstelm.w VX0, Y, 1 * SIZE, 7
add.d Y, Y, INCY
xvstelm.w VX1, Y, 0 * SIZE, 0
xvstelm.w VX1, Y, 1 * SIZE, 1
add.d Y, Y, INCY
xvstelm.w VX1, Y, 0 * SIZE, 2
xvstelm.w VX1, Y, 1 * SIZE, 3
add.d Y, Y, INCY
xvstelm.w VX1, Y, 0 * SIZE, 4
xvstelm.w VX1, Y, 1 * SIZE, 5
add.d Y, Y, INCY
xvstelm.w VX1, Y, 0 * SIZE, 6
xvstelm.w VX1, Y, 1 * SIZE, 7
#endif
add.d Y, Y, INCY
addi.d X, X, 16 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3

.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L123:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
addi.d X, X, 2 * SIZE
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3

.L21:
bge $r0, I, .L212
.align 3

.L211:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX2, t1, 0
xvinsgr2vr.d VX2, t2, 1
xvinsgr2vr.d VX2, t3, 2
xvinsgr2vr.d VX2, t4, 3
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX3, t1, 0
xvinsgr2vr.d VX3, t2, 1
xvinsgr2vr.d VX3, t3, 2
xvinsgr2vr.d VX3, t4, 3
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 4 * SIZE
xvst VX2, Y, 8 * SIZE
xvst VX3, Y, 12 * SIZE
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX1, t1, 0
xvinsgr2vr.w VX1, t2, 1
xvinsgr2vr.w VX1, t3, 2
xvinsgr2vr.w VX1, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX1, t1, 4
xvinsgr2vr.w VX1, t2, 5
xvinsgr2vr.w VX1, t3, 6
xvinsgr2vr.w VX1, t4, 7
xvst VX0, Y, 0 * SIZE
xvst VX1, Y, 8 * SIZE
#endif
addi.d I, I, -1
addi.d Y, Y, 16 * SIZE
blt $r0, I, .L211
.align 3

.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L213:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d X, X, INCX
addi.d Y, Y, 2 * SIZE
blt $r0, I, .L213
b .L999
.align 3

.L22:
bge $r0, I, .L223
.align 3

.L222:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY

LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY

LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY

LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3

.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L224:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3

.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
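
The copy kernel above (and its LSX counterpart that follows) moves complex elements unchanged; only the load/store addressing differs between the unit-stride and strided paths. A scalar C sketch of the semantics, reusing the illustrative FLOAT typedef from the sketch after the AXPY kernel:

/* Scalar model: y[i] = x[i] for complex elements stored as (re, im) pairs;
 * incx/incy are counted in complex elements, as in the BLAS interface. */
static void zcopy_ref(long n, const FLOAT *x, long incx, FLOAT *y, long incy)
{
    for (long i = 0; i < n; i++) {
        y[0] = x[0];   /* real part */
        y[1] = x[1];   /* imaginary part */
        x += 2 * incx;
        y += 2 * incy;
    }
}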
@@ -0,0 +1,411 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15

PROLOGUE
bge $r0, N, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
slli.d INCY, INCY, ZBASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1

.L11: // INCX==1 and INCY==1
bge $r0, I, .L112
.align 3

.L111:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
vld VX2, X, 4 * SIZE
vld VX3, X, 6 * SIZE
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
vld VX0, X, 8 * SIZE
vld VX1, X, 10 * SIZE
vld VX2, X, 12 * SIZE
vld VX3, X, 14 * SIZE
addi.d I, I, -1
vst VX0, Y, 8 * SIZE
vst VX1, Y, 10 * SIZE
vst VX2, Y, 12 * SIZE
vst VX3, Y, 14 * SIZE
#else
vld VX1, X, 4 * SIZE
vld VX2, X, 8 * SIZE
vld VX3, X, 12 * SIZE
addi.d I, I, -1
vst VX0, Y, 0 * SIZE
vst VX1, Y, 4 * SIZE
vst VX2, Y, 8 * SIZE
vst VX3, Y, 12 * SIZE
#endif
addi.d X, X, 16 * SIZE
addi.d Y, Y, 16 * SIZE
blt $r0, I, .L111
.align 3

.L112:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L113:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
addi.d X, X, 2 * SIZE
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
addi.d Y, Y, 2 * SIZE
blt $r0, I, .L113
b .L999
.align 3

.L12: // INCX==1 and INCY!=1
bge $r0, I, .L122
.align 3

.L121:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
vld VX2, X, 4 * SIZE
vld VX3, X, 6 * SIZE
vstelm.d VX0, Y, 0 * SIZE, 0
vstelm.d VX0, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0 * SIZE, 0
vstelm.d VX1, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.d VX2, Y, 0 * SIZE, 0
vstelm.d VX2, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.d VX3, Y, 0 * SIZE, 0
vstelm.d VX3, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vld VX0, X, 8 * SIZE
vld VX1, X, 10 * SIZE
vld VX2, X, 12 * SIZE
vld VX3, X, 14 * SIZE
vstelm.d VX0, Y, 0 * SIZE, 0
vstelm.d VX0, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.d VX1, Y, 0 * SIZE, 0
vstelm.d VX1, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.d VX2, Y, 0 * SIZE, 0
vstelm.d VX2, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.d VX3, Y, 0 * SIZE, 0
vstelm.d VX3, Y, 1 * SIZE, 1
#else
vld VX1, X, 4 * SIZE
vld VX2, X, 8 * SIZE
vld VX3, X, 12 * SIZE
vstelm.w VX0, Y, 0 * SIZE, 0
vstelm.w VX0, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.w VX0, Y, 0 * SIZE, 2
vstelm.w VX0, Y, 1 * SIZE, 3
add.d Y, Y, INCY
vstelm.w VX1, Y, 0 * SIZE, 0
vstelm.w VX1, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.w VX1, Y, 0 * SIZE, 2
vstelm.w VX1, Y, 1 * SIZE, 3
add.d Y, Y, INCY
vstelm.w VX2, Y, 0 * SIZE, 0
vstelm.w VX2, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.w VX2, Y, 0 * SIZE, 2
vstelm.w VX2, Y, 1 * SIZE, 3
add.d Y, Y, INCY
vstelm.w VX3, Y, 0 * SIZE, 0
vstelm.w VX3, Y, 1 * SIZE, 1
add.d Y, Y, INCY
vstelm.w VX3, Y, 0 * SIZE, 2
vstelm.w VX3, Y, 1 * SIZE, 3
#endif
add.d Y, Y, INCY
addi.d X, X, 16 * SIZE
addi.d I, I, -1
blt $r0, I, .L121
.align 3

.L122:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L123:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
addi.d X, X, 2 * SIZE
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L123
b .L999
.align 3

.L21:
bge $r0, I, .L212
.align 3

.L211:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 0 * SIZE
vst VX1, Y, 2 * SIZE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 4 * SIZE
vst VX1, Y, 6 * SIZE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 8 * SIZE
vst VX1, Y, 10 * SIZE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vst VX0, Y, 12 * SIZE
vst VX1, Y, 14 * SIZE
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX2, t1, 0
vinsgr2vr.w VX2, t2, 1
vinsgr2vr.w VX2, t3, 2
vinsgr2vr.w VX2, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX3, t1, 0
vinsgr2vr.w VX3, t2, 1
vinsgr2vr.w VX3, t3, 2
vinsgr2vr.w VX3, t4, 3
vst VX0, Y, 0 * SIZE
vst VX1, Y, 4 * SIZE
vst VX2, Y, 8 * SIZE
vst VX3, Y, 12 * SIZE
#endif
addi.d Y, Y, 16 * SIZE
addi.d I, I, -1
blt $r0, I, .L211
.align 3

.L212:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L213:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d X, X, INCX
addi.d Y, Y, 2 * SIZE
blt $r0, I, .L213
b .L999
.align 3

.L22:
bge $r0, I, .L223
.align 3

.L222:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY

LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY

LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY

LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L222
.align 3

.L223:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L224:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L224
.align 3

.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
@@ -0,0 +1,565 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r19
#define TEMP $r10
#define t1 $r11
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define s1 $f16
#define s2 $f17
#define s3 $f18
#define s4 $f19
#define res1 $xr16
#define res2 $xr17
#define res3 $xr18
#define res4 $xr19
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define x1 $xr20
#define x2 $xr21
#define x3 $xr22
#define x4 $xr23

PROLOGUE
xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
xvxor.v res3, res3, res3
xvxor.v res4, res4, res4
bge $r0, N, .L999
li.d TEMP, 2 * SIZE
slli.d INCX, INCX, ZBASE_SHIFT
slli.d INCY, INCY, ZBASE_SHIFT
#ifdef DOUBLE
srai.d I, N, 2
#else
srai.d I, N, 3
#endif
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1

.L11:
bge $r0, I, .L997
.align 3

.L111:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX3, Y, 4 * SIZE
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
xvpickev.d x3, VX3, VX2
xvpickod.d x4, VX3, VX2
xvfmadd.d res1, x1, x3, res1
xvfmadd.d res2, x2, x3, res2
xvfmadd.d res3, x1, x4, res3
xvfmadd.d res4, x2, x4, res4
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
#else
xvld VX1, X, 8 * SIZE
xvld VX2, Y, 0 * SIZE
xvld VX3, Y, 8 * SIZE
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvpickev.w x3, VX3, VX2
xvpickod.w x4, VX3, VX2
xvfmadd.s res1, x1, x3, res1
xvfmadd.s res2, x2, x3, res2
xvfmadd.s res3, x1, x4, res3
xvfmadd.s res4, x2, x4, res4
addi.d X, X, 16 * SIZE
addi.d Y, Y, 16 * SIZE
#endif
addi.d I, I, -1
blt $r0, I, .L111
b .L996
.align 3

.L12:
bge $r0, I, .L997
.align 3

.L121:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.d x3, t1, 0
xvinsgr2vr.d x4, t2, 0
xvinsgr2vr.d x3, t3, 2
xvinsgr2vr.d x4, t4, 2
xvld VX1, X, 4 * SIZE
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.d x3, t1, 1
xvinsgr2vr.d x4, t2, 1
xvinsgr2vr.d x3, t3, 3
xvinsgr2vr.d x4, t4, 3
addi.d X, X, 8 * SIZE
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
xvfmadd.d res1, x1, x3, res1
xvfmadd.d res2, x2, x3, res2
xvfmadd.d res3, x1, x4, res3
xvfmadd.d res4, x2, x4, res4
#else
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 0
xvinsgr2vr.w x4, t2, 0
xvinsgr2vr.w x3, t3, 1
xvinsgr2vr.w x4, t4, 1
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 4
xvinsgr2vr.w x4, t2, 4
xvinsgr2vr.w x3, t3, 5
xvinsgr2vr.w x4, t4, 5
xvld VX1, X, 8 * SIZE
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 2
xvinsgr2vr.w x4, t2, 2
xvinsgr2vr.w x3, t3, 3
xvinsgr2vr.w x4, t4, 3
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 6
xvinsgr2vr.w x4, t2, 6
xvinsgr2vr.w x3, t3, 7
xvinsgr2vr.w x4, t4, 7
addi.d X, X, 16 * SIZE
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmadd.s res1, x1, x3, res1
xvfmadd.s res2, x2, x3, res2
xvfmadd.s res3, x1, x4, res3
xvfmadd.s res4, x2, x4, res4
#endif
addi.d I, I, -1
blt $r0, I, .L121
b .L996
.align 3

.L21:
bge $r0, I, .L997
.align 3

.L211:
xvld VX2, Y, 0 * SIZE
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 0
xvinsgr2vr.d x2, t2, 0
xvinsgr2vr.d x1, t3, 2
xvinsgr2vr.d x2, t4, 2
xvld VX3, Y, 4 * SIZE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 1
xvinsgr2vr.d x2, t2, 1
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
addi.d Y, Y, 8 * SIZE
xvpickev.d x3, VX3, VX2
xvpickod.d x4, VX3, VX2
xvfmadd.d res1, x1, x3, res1
xvfmadd.d res2, x2, x3, res2
xvfmadd.d res3, x1, x4, res3
xvfmadd.d res4, x2, x4, res4
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 0
xvinsgr2vr.w x2, t2, 0
xvinsgr2vr.w x1, t3, 1
xvinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 4
xvinsgr2vr.w x2, t2, 4
xvinsgr2vr.w x1, t3, 5
xvinsgr2vr.w x2, t4, 5
xvld VX3, Y, 8 * SIZE
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 2
xvinsgr2vr.w x2, t2, 2
xvinsgr2vr.w x1, t3, 3
xvinsgr2vr.w x2, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 6
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
addi.d Y, Y, 8 * SIZE
xvpickev.w x3, VX3, VX2
xvpickod.w x4, VX3, VX2
xvfmadd.s res1, x1, x3, res1
xvfmadd.s res2, x2, x3, res2
xvfmadd.s res3, x1, x4, res3
xvfmadd.s res4, x2, x4, res4
#endif
addi.d I, I, -1
blt $r0, I, .L211
b .L996
.align 3

.L22:
bge $r0, I, .L997
.align 3

.L222:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 0
xvinsgr2vr.d x2, t2, 0
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.d x3, t1, 0
xvinsgr2vr.d x4, t2, 0
xvinsgr2vr.d x3, t3, 1
xvinsgr2vr.d x4, t4, 1
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 2
xvinsgr2vr.d x2, t2, 2
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.d x3, t1, 2
xvinsgr2vr.d x4, t2, 2
xvinsgr2vr.d x3, t3, 3
xvinsgr2vr.d x4, t4, 3
xvfmadd.d res1, x1, x3, res1
xvfmadd.d res2, x2, x3, res2
xvfmadd.d res3, x1, x4, res3
xvfmadd.d res4, x2, x4, res4
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 0
xvinsgr2vr.w x2, t2, 0
xvinsgr2vr.w x1, t3, 1
xvinsgr2vr.w x2, t4, 1
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 0
xvinsgr2vr.w x4, t2, 0
xvinsgr2vr.w x3, t3, 1
xvinsgr2vr.w x4, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 2
xvinsgr2vr.w x2, t2, 2
xvinsgr2vr.w x1, t3, 3
xvinsgr2vr.w x2, t4, 3
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 2
xvinsgr2vr.w x4, t2, 2
xvinsgr2vr.w x3, t3, 3
xvinsgr2vr.w x4, t4, 3
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 4
xvinsgr2vr.w x2, t2, 4
xvinsgr2vr.w x1, t3, 5
xvinsgr2vr.w x2, t4, 5
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 4
xvinsgr2vr.w x4, t2, 4
xvinsgr2vr.w x3, t3, 5
xvinsgr2vr.w x4, t4, 5
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 6
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
xvinsgr2vr.w x3, t1, 6
xvinsgr2vr.w x4, t2, 6
xvinsgr2vr.w x3, t3, 7
xvinsgr2vr.w x4, t4, 7
xvfmadd.s res1, x1, x3, res1
xvfmadd.s res2, x2, x3, res2
xvfmadd.s res3, x1, x4, res3
xvfmadd.s res4, x2, x4, res4
#endif
addi.d I, I, -1
blt $r0, I, .L222
.align 3

.L996:
#ifdef DOUBLE
xvpickve.d VX1, res1, 1
xvpickve.d VX2, res1, 2
xvpickve.d VX3, res1, 3
xvfadd.d res1, VX1, res1
xvfadd.d res1, VX2, res1
xvfadd.d res1, VX3, res1
xvpickve.d VX1, res2, 1
xvpickve.d VX2, res2, 2
xvpickve.d VX3, res2, 3
xvfadd.d res2, VX1, res2
xvfadd.d res2, VX2, res2
xvfadd.d res2, VX3, res2
xvpickve.d VX1, res3, 1
xvpickve.d VX2, res3, 2
xvpickve.d VX3, res3, 3
xvfadd.d res3, VX1, res3
xvfadd.d res3, VX2, res3
xvfadd.d res3, VX3, res3
xvpickve.d VX1, res4, 1
xvpickve.d VX2, res4, 2
xvpickve.d VX3, res4, 3
xvfadd.d res4, VX1, res4
xvfadd.d res4, VX2, res4
xvfadd.d res4, VX3, res4
#else
xvpickve.w VX0, res1, 1
xvpickve.w VX1, res1, 2
xvpickve.w VX2, res1, 3
xvpickve.w VX3, res1, 4
xvpickve.w x1, res1, 5
xvpickve.w x2, res1, 6
xvpickve.w x3, res1, 7
xvfadd.s res1, VX0, res1
xvfadd.s res1, VX1, res1
xvfadd.s res1, VX2, res1
xvfadd.s res1, VX3, res1
xvfadd.s res1, x1, res1
xvfadd.s res1, x2, res1
xvfadd.s res1, x3, res1
xvpickve.w VX0, res2, 1
xvpickve.w VX1, res2, 2
xvpickve.w VX2, res2, 3
xvpickve.w VX3, res2, 4
xvpickve.w x1, res2, 5
xvpickve.w x2, res2, 6
xvpickve.w x3, res2, 7
xvfadd.s res2, VX0, res2
xvfadd.s res2, VX1, res2
xvfadd.s res2, VX2, res2
xvfadd.s res2, VX3, res2
xvfadd.s res2, x1, res2
xvfadd.s res2, x2, res2
xvfadd.s res2, x3, res2
xvpickve.w VX0, res3, 1
xvpickve.w VX1, res3, 2
xvpickve.w VX2, res3, 3
xvpickve.w VX3, res3, 4
xvpickve.w x1, res3, 5
xvpickve.w x2, res3, 6
xvpickve.w x3, res3, 7
xvfadd.s res3, VX0, res3
xvfadd.s res3, VX1, res3
xvfadd.s res3, VX2, res3
xvfadd.s res3, VX3, res3
xvfadd.s res3, x1, res3
xvfadd.s res3, x2, res3
xvfadd.s res3, x3, res3
xvpickve.w VX0, res4, 1
xvpickve.w VX1, res4, 2
xvpickve.w VX2, res4, 3
xvpickve.w VX3, res4, 4
xvpickve.w x1, res4, 5
xvpickve.w x2, res4, 6
xvpickve.w x3, res4, 7
xvfadd.s res4, VX0, res4
xvfadd.s res4, VX1, res4
xvfadd.s res4, VX2, res4
xvfadd.s res4, VX3, res4
xvfadd.s res4, x1, res4
xvfadd.s res4, x2, res4
xvfadd.s res4, x3, res4
#endif
.align 3

.L997:
#ifdef DOUBLE
andi I, N, 3
#else
andi I, N, 7
#endif
bge $r0, I, .L999
.align 3

.L998:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, Y, 0 * SIZE
LD a4, Y, 1 * SIZE
MADD s1, a1, a3, s1
MADD s2, a2, a3, s2
MADD s3, a1, a4, s3
MADD s4, a2, a4, s4
addi.d I, I, -1
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3

.L999:
#ifndef CONJ
SUB $f0, s1, s4
ADD $f1, s3, s2
#else
ADD $f0, s1, s4
SUB $f1, s3, s2
#endif
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
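
The dot kernel above keeps four partial sums (res1..res4, holding the running xr*yr, xi*yr, xr*yi and xi*yi products) and only combines them at .L999, so the same loop body serves both the unconjugated and the conjugated dot product. A scalar C sketch of that reduction, with the same illustrative FLOAT typedef as the earlier sketches:

/* Scalar model of the four-accumulator complex dot product above. */
static void zdot_ref(long n, const FLOAT *x, long incx,
                     const FLOAT *y, long incy, FLOAT *re, FLOAT *im)
{
    FLOAT s1 = 0, s2 = 0, s3 = 0, s4 = 0;
    for (long i = 0; i < n; i++) {
        s1 += x[0] * y[0];   /* xr*yr, like res1 */
        s2 += x[1] * y[0];   /* xi*yr, like res2 */
        s3 += x[0] * y[1];   /* xr*yi, like res3 */
        s4 += x[1] * y[1];   /* xi*yi, like res4 */
        x += 2 * incx;
        y += 2 * incy;
    }
#ifndef CONJ
    *re = s1 - s4;   /* dotu: sum of x[i] * y[i]       */
    *im = s3 + s2;
#else
    *re = s1 + s4;   /* dotc: sum of conj(x[i]) * y[i] */
    *im = s3 - s2;
#endif
}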
@ -0,0 +1,397 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
#define I $r19
|
||||
#define TEMP $r10
|
||||
#define t1 $r11
|
||||
#define t2 $r12
|
||||
#define t3 $r13
|
||||
#define t4 $r14
|
||||
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define s1 $f16
|
||||
#define s2 $f17
|
||||
#define s3 $f18
|
||||
#define s4 $f19
|
||||
#define res1 $vr16
|
||||
#define res2 $vr17
|
||||
#define res3 $vr18
|
||||
#define res4 $vr19
|
||||
#define VX0 $vr12
|
||||
#define VX1 $vr13
|
||||
#define VX2 $vr14
|
||||
#define VX3 $vr15
|
||||
#define x1 $vr20
|
||||
#define x2 $vr21
|
||||
#define x3 $vr22
|
||||
#define x4 $vr23
|
||||
|
||||
PROLOGUE
|
||||
vxor.v res1, res1, res1
|
||||
vxor.v res2, res2, res2
|
||||
vxor.v res3, res3, res3
|
||||
vxor.v res4, res4, res4
|
||||
bge $r0, N, .L999
|
||||
li.d TEMP, 2 * SIZE
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
slli.d INCY, INCY, ZBASE_SHIFT
|
||||
#ifdef DOUBLE
|
||||
srai.d I, N, 1
|
||||
#else
|
||||
srai.d I, N, 2
|
||||
#endif
|
||||
bne INCX, TEMP, .L20
|
||||
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
|
||||
b .L11 // INCX==1 and INCY==1
|
||||
.L20:
|
||||
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
|
||||
b .L21 // INCX!=1 and INCY==1
|
||||
|
||||
.L11:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L111:
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX3, Y, 2 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vpickev.d x3, VX3, VX2
|
||||
vpickod.d x4, VX3, VX2
|
||||
vfmadd.d res1, x1, x3, res1
|
||||
vfmadd.d res2, x2, x3, res2
|
||||
vfmadd.d res3, x1, x4, res3
|
||||
vfmadd.d res4, x2, x4, res4
|
||||
addi.d X, X, 4 * SIZE
|
||||
addi.d Y, Y, 4 * SIZE
|
||||
#else
|
||||
vld VX1, X, 4 * SIZE
|
||||
vld VX2, Y, 0 * SIZE
|
||||
vld VX3, Y, 4 * SIZE
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vpickev.w x3, VX3, VX2
|
||||
vpickod.w x4, VX3, VX2
|
||||
vfmadd.s res1, x1, x3, res1
|
||||
vfmadd.s res2, x2, x3, res2
|
||||
vfmadd.s res3, x1, x4, res3
|
||||
vfmadd.s res4, x2, x4, res4
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L111
|
||||
b .L996
|
||||
.align 3
|
||||
|
||||
.L12:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L121:
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vld VX1, X, 2 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.d x3, t1, 0
|
||||
vinsgr2vr.d x4, t2, 0
|
||||
vinsgr2vr.d x3, t3, 1
|
||||
vinsgr2vr.d x4, t4, 1
|
||||
addi.d X, X, 4 * SIZE
|
||||
vpickev.d x1, VX1, VX0
|
||||
vpickod.d x2, VX1, VX0
|
||||
vfmadd.d res1, x1, x3, res1
|
||||
vfmadd.d res2, x2, x3, res2
|
||||
vfmadd.d res3, x1, x4, res3
|
||||
vfmadd.d res4, x2, x4, res4
|
||||
#else
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 0
|
||||
vinsgr2vr.w x4, t2, 0
|
||||
vinsgr2vr.w x3, t3, 1
|
||||
vinsgr2vr.w x4, t4, 1
|
||||
vld VX1, X, 4 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
vinsgr2vr.w x3, t1, 2
|
||||
vinsgr2vr.w x4, t2, 2
|
||||
vinsgr2vr.w x3, t3, 3
|
||||
vinsgr2vr.w x4, t4, 3
|
||||
addi.d X, X, 8 * SIZE
|
||||
vpickev.w x1, VX1, VX0
|
||||
vpickod.w x2, VX1, VX0
|
||||
vfmadd.s res1, x1, x3, res1
|
||||
vfmadd.s res2, x2, x3, res2
|
||||
vfmadd.s res3, x1, x4, res3
|
||||
vfmadd.s res4, x2, x4, res4
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L121
|
||||
b .L996
|
||||
.align 3
|
||||
|
||||
.L21:
|
||||
bge $r0, I, .L997
|
||||
.align 3
|
||||
|
||||
.L211:
vld VX2, Y, 0 * SIZE
#ifdef DOUBLE
vld VX3, Y, 2 * SIZE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
addi.d Y, Y, 4 * SIZE
vpickev.d x3, VX3, VX2
vpickod.d x4, VX3, VX2
vfmadd.d res1, x1, x3, res1
vfmadd.d res2, x2, x3, res2
vfmadd.d res3, x1, x4, res3
vfmadd.d res4, x2, x4, res4
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
vld VX3, Y, 4 * SIZE
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
addi.d Y, Y, 8 * SIZE
vpickev.w x3, VX3, VX2
vpickod.w x4, VX3, VX2
vfmadd.s res1, x1, x3, res1
vfmadd.s res2, x2, x3, res2
vfmadd.s res3, x1, x4, res3
vfmadd.s res4, x2, x4, res4
#endif
addi.d I, I, -1
blt $r0, I, .L211
b .L996
.align 3

.L22:
bge $r0, I, .L997
.align 3

.L222:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
vfmadd.d res1, x1, x3, res1
vfmadd.d res2, x2, x3, res2
vfmadd.d res3, x1, x4, res3
vfmadd.d res4, x2, x4, res4
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w x3, t1, 0
vinsgr2vr.w x4, t2, 0
vinsgr2vr.w x3, t3, 1
vinsgr2vr.w x4, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w x3, t1, 2
vinsgr2vr.w x4, t2, 2
vinsgr2vr.w x3, t3, 3
vinsgr2vr.w x4, t4, 3
vfmadd.s res1, x1, x3, res1
vfmadd.s res2, x2, x3, res2
vfmadd.s res3, x1, x4, res3
vfmadd.s res4, x2, x4, res4
#endif
addi.d I, I, -1
blt $r0, I, .L222
.align 3

.L996:
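// Horizontal reduction (descriptive note, not in the original source): each
// 128-bit accumulator is collapsed to a scalar by replicating the remaining
// lanes with vreplvei and summing them -- one extra lane for double, three
// for float.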
#ifdef DOUBLE
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
vreplvei.d VX1, res2, 1
vfadd.d res2, VX1, res2
vreplvei.d VX1, res3, 1
vfadd.d res3, VX1, res3
vreplvei.d VX1, res4, 1
vfadd.d res4, VX1, res4
#else
vreplvei.w VX1, res1, 1
vreplvei.w VX2, res1, 2
vreplvei.w VX3, res1, 3
vfadd.s res1, VX1, res1
vfadd.s res1, VX2, res1
vfadd.s res1, VX3, res1
vreplvei.w VX1, res2, 1
vreplvei.w VX2, res2, 2
vreplvei.w VX3, res2, 3
vfadd.s res2, VX1, res2
vfadd.s res2, VX2, res2
vfadd.s res2, VX3, res2
vreplvei.w VX1, res3, 1
vreplvei.w VX2, res3, 2
vreplvei.w VX3, res3, 3
vfadd.s res3, VX1, res3
vfadd.s res3, VX2, res3
vfadd.s res3, VX3, res3
vreplvei.w VX1, res4, 1
vreplvei.w VX2, res4, 2
vreplvei.w VX3, res4, 3
vfadd.s res4, VX1, res4
vfadd.s res4, VX2, res4
vfadd.s res4, VX3, res4
#endif
.align 3

.L997:
#ifdef DOUBLE
andi I, N, 1
#else
andi I, N, 3
#endif
bge $r0, I, .L999
.align 3

.L998:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, Y, 0 * SIZE
LD a4, Y, 1 * SIZE
MADD s1, a1, a3, s1
MADD s2, a2, a3, s2
MADD s3, a1, a4, s3
MADD s4, a2, a4, s4
addi.d I, I, -1
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3

.L999:
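// Final combine (descriptive note, not in the original source): without CONJ
// this yields the plain dot product (xr*yr - xi*yi) + i*(xr*yi + xi*yr);
// with CONJ the sign flips give conj(x).y. Roughly, in C terms (illustrative
// sketch only):
//   result = CONJ ? (s1 + s4) + I*(s3 - s2) : (s1 - s4) + I*(s3 + s2);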
#ifndef CONJ
SUB $f0, s1, s4
ADD $f1, s3, s2
#else
ADD $f0, s1, s4
SUB $f1, s3, s2
#endif
jirl $r0, $r1, 0x0
.align 3

EPILOGUE

@@ -0,0 +1,857 @@
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: bm
|
||||
#define N $r5 // param 2: bn
|
||||
#define K $r6 // param 3: bk
|
||||
#define ALPHA_R $f0 // param 4: alphar
|
||||
#define ALPHA_I $f1 // param 5: alphai
|
||||
#define A $r7 // param 6: ba
|
||||
#define B $r8 // param 7: bb
|
||||
#define C $r9 // param 8: bc
|
||||
#define LDC $r10 // param 9: ldc
|
||||
|
||||
#if defined (TRMMKERNEL)
|
||||
#define OFFSET $r11 // param 10: offset
|
||||
#endif
|
||||
#define OFF $r26
|
||||
|
||||
#define I $r12
|
||||
#define J $r13
|
||||
#define L $r14
|
||||
#define TL $r15
|
||||
#define A0 $r16
|
||||
#define B0 $r17
|
||||
#define C0 $r18
|
||||
#define C1 $r19
|
||||
#define C2 $r20
|
||||
#define C3 $r23
|
||||
#define T0 $r24
|
||||
#define T1 $r25
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
#define b1 $f10
|
||||
#define b2 $f11
|
||||
#define b3 $f12
|
||||
#define b4 $f13
|
||||
#define b5 $f14
|
||||
#define b6 $f15
|
||||
#define b7 $f16
|
||||
#define b8 $f17
|
||||
#define c11 $f18
|
||||
#define c12 $f19
|
||||
#define c21 $f20
|
||||
#define c22 $f21
|
||||
#define c31 $f22
|
||||
#define c32 $f23
|
||||
#define c41 $f24
|
||||
#define c42 $f25
|
||||
|
||||
/* LASX vectors */
|
||||
#define U0 $xr30
|
||||
#define U1 $xr31
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define U8 $xr8
|
||||
#define U9 $xr9
|
||||
#define U10 $xr10
|
||||
#define U11 $xr11
|
||||
#define U12 $xr12
|
||||
#define U13 $xr13
|
||||
#define U14 $xr14
|
||||
#define U15 $xr15
|
||||
#define D0 $xr16
|
||||
#define D1 $xr17
|
||||
#define D2 $xr18
|
||||
#define D3 $xr19
|
||||
#define D4 $xr20
|
||||
#define D5 $xr21
|
||||
#define D6 $xr22
|
||||
#define D7 $xr23
|
||||
#define D8 $xr24
|
||||
#define D9 $xr25
|
||||
#define D10 $xr26
|
||||
#define D11 $xr27
|
||||
#define VALPHAR $xr28
|
||||
#define VALPHAI $xr29
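/* Sign conventions for complex multiply-accumulate (explanatory comment,
 * not in the original source). For c += a*b with optional conjugation,
 * the scalar macros are used as (illustrative sketch):
 *   cr = MADD1(ar, br, MADD3(ai, bi, cr));   // cr += ar*br -/+ ai*bi
 *   ci = MADD2(ai, br, MADD4(ar, bi, ci));   // ci += ai*br +/- ar*bi
 * NN-type kernels take (-,+); conj(b) kernels take (+,-); conj(a) kernels
 * negate the ai*br term instead; conj(a)*conj(b) negates all three cross
 * terms. The XVMADD and VMADD variants apply the same pattern lane-wise. */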
|
||||
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define XVMADD1 XVFMADD
|
||||
#define XVMADD2 XVFMADD
|
||||
#define XVMADD3 XVNMSUB
|
||||
#define XVMADD4 XVFMADD
|
||||
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VFMADD
|
||||
#define VMADD3 VNMSUB
|
||||
#define VMADD4 VFMADD
|
||||
|
||||
#define XVFADD1 XVFADD
|
||||
#define XVFADD2 XVFADD
|
||||
#define XVFADD3 XVFSUB
|
||||
#define XVFADD4 XVFADD
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 MADD
|
||||
#define MADD3 NMSUB
|
||||
#define MADD4 MADD
|
||||
#endif
|
||||
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
#define XVMADD1 XVFMADD
|
||||
#define XVMADD2 XVFMADD
|
||||
#define XVMADD3 XVFMADD
|
||||
#define XVMADD4 XVNMSUB
|
||||
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VFMADD
|
||||
#define VMADD3 VFMADD
|
||||
#define VMADD4 VNMSUB
|
||||
|
||||
#define XVFADD1 XVFADD
|
||||
#define XVFADD2 XVFADD
|
||||
#define XVFADD3 XVFADD
|
||||
#define XVFADD4 XVFSUB
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 MADD
|
||||
#define MADD3 MADD
|
||||
#define MADD4 NMSUB
|
||||
#endif
|
||||
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
#define XVMADD1 XVFMADD
|
||||
#define XVMADD2 XVNMSUB
|
||||
#define XVMADD3 XVFMADD
|
||||
#define XVMADD4 XVFMADD
|
||||
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VNMSUB
|
||||
#define VMADD3 VFMADD
|
||||
#define VMADD4 VFMADD
|
||||
|
||||
#define XVFADD1 XVFADD
|
||||
#define XVFADD2 XVFSUB
|
||||
#define XVFADD3 XVFADD
|
||||
#define XVFADD4 XVFADD
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 NMSUB
|
||||
#define MADD3 MADD
|
||||
#define MADD4 MADD
|
||||
#endif
|
||||
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#define XVMADD1 XVFMADD
|
||||
#define XVMADD2 XVNMSUB
|
||||
#define XVMADD3 XVNMSUB
|
||||
#define XVMADD4 XVNMSUB
|
||||
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VNMSUB
|
||||
#define VMADD3 VNMSUB
|
||||
#define VMADD4 VNMSUB
|
||||
|
||||
#define XVFADD1 XVFADD
|
||||
#define XVFADD2 XVFSUB
|
||||
#define XVFADD3 XVFSUB
|
||||
#define XVFADD4 XVFSUB
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 NMSUB
|
||||
#define MADD3 NMSUB
|
||||
#define MADD4 NMSUB
|
||||
#endif
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -128
|
||||
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
SDARG $r25, $sp, 16
|
||||
SDARG $r26, $sp, 24
|
||||
SDARG $r27, $sp, 32
|
||||
ST $f23, $sp, 40
|
||||
ST $f24, $sp, 48
|
||||
ST $f25, $sp, 56
|
||||
ST $f26, $sp, 64
|
||||
ST $f27, $sp, 72
|
||||
ST $f28, $sp, 80
|
||||
ST $f29, $sp, 88
|
||||
ST $f30, $sp, 96
|
||||
ST $f31, $sp, 104
|
||||
ST ALPHA_R,$sp, 112
|
||||
ST ALPHA_I,$sp, 120
|
||||
|
||||
xvldrepl.w VALPHAR, $sp, 112
|
||||
xvldrepl.w VALPHAI, $sp, 120
|
||||
|
||||
#if defined (TRMMKERNEL) && !defined(LEFT)
|
||||
sub.d OFF, $r0, OFFSET
|
||||
#else
|
||||
xor OFF, OFF, OFF
|
||||
#endif
|
||||
|
||||
slli.d LDC, LDC, 2
|
||||
|
||||
move J, $r0
|
||||
srai.d T0, N, 1
|
||||
beq J, T0, .L19
|
||||
|
||||
.L10: /* for(j=0; j<bn/2; j+=1) */
|
||||
move C0, C
|
||||
slli.d TL, LDC, 1
|
||||
add.d C1, C0, TL
|
||||
move A0, A //ptrba
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L150
|
||||
|
||||
.L11: /* for(i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF //temp
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
xvxor.v U0, U0, U0
|
||||
xvxor.v U1, U1, U1
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
srai.d C2, TL, 2
|
||||
beq L, C2, .L130
|
||||
blt C2, L, .L130
|
||||
|
||||
.L12: /* for(k=0; k<bk/4; k+=1) */
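// Unrolled k-loop (descriptive note, not in the original source): each
// iteration consumes four k steps. xvpermi.q/xvand.v rebuild the 128-bit
// lane pairs, then xvshuf4i.w with masks 0x88/0xdd packs the even (real)
// and odd (imag) elements of A, while 0xa0/0xf5 duplicates the real/imag
// parts of B, so the XVMADD1..4 chain performs the complex
// multiply-accumulate on whole vectors.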
|
||||
xvld D0, A0, 0x00 //a 0-7
|
||||
xvld D1, A0, 0x20 //a 8-15
|
||||
xvld D2, B0, 0x00 //b 0-7
|
||||
xvld D3, B0, 0x20 //b 8-15
|
||||
|
||||
xvand.v D4, D0, D0
|
||||
xvpermi.q D4, D1, 0x02 //a 0 1 2 3 8 9 10 11
|
||||
xvand.v D5, D4, D4
|
||||
xvshuf4i.w D4, D4, 0x88 //a 0 2 0 2 8 10 8 10
|
||||
xvshuf4i.w D5, D5, 0xdd //a 1 3 1 3 9 11 9 11
|
||||
|
||||
xvand.v D6, D1, D1
|
||||
xvpermi.q D6, D0, 0x31 //a 4 5 6 7 12 13 14 15
|
||||
xvand.v D7, D6, D6
|
||||
xvshuf4i.w D6, D6, 0x88 //a 4 6 4 6 12 14 12 14
|
||||
xvshuf4i.w D7, D7, 0xdd //a 5 7 5 7 13 15 13 15
|
||||
|
||||
xvand.v D8, D2, D2
|
||||
xvpermi.q D8, D3, 0x02 //b 0 1 2 3 8 9 10 11
|
||||
xvand.v D9, D8, D8
|
||||
xvshuf4i.w D8, D8, 0xa0 //b 0 0 2 2 8 8 10 10
|
||||
xvshuf4i.w D9, D9, 0xf5 //b 1 1 3 3 9 9 11 11
|
||||
|
||||
xvand.v D10, D3, D3
|
||||
xvpermi.q D10, D2, 0x31 //b 4 5 6 7 12 13 14 15
|
||||
xvand.v D11, D10, D10
|
||||
xvshuf4i.w D10, D10, 0xa0 //b 4 4 6 6 12 12 14 14
|
||||
xvshuf4i.w D11, D11, 0xf5 //b 5 5 7 7 13 13 15 15
|
||||
|
||||
XVMADD1 U0, D4, D8, U0 //res0 2 4 6 0 2 4 6
|
||||
XVMADD2 U1, D5, D8, U1 //res1 3 5 7 1 3 5 7
|
||||
|
||||
xvpermi.q U0, U0, 0x01
|
||||
xvpermi.q U1, U1, 0x01
|
||||
XVMADD1 U0, D4, D8, U0
|
||||
XVMADD2 U1, D5, D8, U1
|
||||
|
||||
XVMADD3 U0, D5, D9, U0
|
||||
XVMADD4 U1, D4, D9, U1
|
||||
|
||||
xvpermi.q U0, U0, 0x01
|
||||
xvpermi.q U1, U1, 0x01
|
||||
XVMADD3 U0, D5, D9, U0
|
||||
XVMADD4 U1, D4, D9, U1
|
||||
|
||||
XVMADD1 U0, D6, D10, U0 //res0 2 4 6 0 2 4 6
|
||||
XVMADD2 U1, D7, D10, U1 //res1 3 5 7 1 3 5 7
|
||||
|
||||
xvpermi.q U0, U0, 0x01
|
||||
xvpermi.q U1, U1, 0x01
|
||||
XVMADD1 U0, D6, D10, U0
|
||||
XVMADD2 U1, D7, D10, U1
|
||||
|
||||
XVMADD3 U0, D7, D11, U0
|
||||
XVMADD4 U1, D6, D11, U1
|
||||
|
||||
xvpermi.q U0, U0, 0x01
|
||||
xvpermi.q U1, U1, 0x01
|
||||
XVMADD3 U0, D7, D11, U0
|
||||
XVMADD4 U1, D6, D11, U1
|
||||
|
||||
addi.d A0, A0, 0x40
|
||||
addi.d B0, B0, 0x40
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L12
|
||||
|
||||
.L130:
|
||||
move L, $r0
|
||||
andi C2, TL, 3
|
||||
beq L, C2, .L14
|
||||
|
||||
.L13: /* for(k=0; k<(bk&3); k+=1) */
|
||||
vld $vr16, A0, 0x00 //a0 a1 a2 a3
|
||||
vld $vr17, B0, 0x00 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w $vr20, $vr17, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w $vr21, $vr17, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w $vr18, $vr16, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w $vr19, $vr16, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 $vr30, $vr18, $vr20, $vr30 //res0 2 4 6
|
||||
VMADD2 $vr31, $vr19, $vr20, $vr31 //res1 3 5 7
|
||||
VMADD3 $vr30, $vr19, $vr21, $vr30
|
||||
VMADD4 $vr31, $vr18, $vr21, $vr31
|
||||
|
||||
addi.d A0, A0, 0x10
|
||||
addi.d B0, B0, 0x10
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L13
|
||||
|
||||
.L14:
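// Writeback (descriptive note, not in the original source): C is loaded and
// de-interleaved into even (real) and odd (imag) words, scaled by alpha
// using the complex multiply pattern, then re-interleaved with vilvl/vilvh
// and stored. In the TRMMKERNEL branch the vfmul pair overwrites the
// destination instead of accumulating into it, since TRMM does not add
// to C.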
|
||||
#if defined(TRMMKERNEL)
|
||||
vld $vr8, C0, 0x00 //0 1 2 3
|
||||
vld $vr9, C1, 0x00 //4 5 6 7
|
||||
|
||||
vpackev.w $vr10, $vr9, $vr8 //0 4 2 6
|
||||
vpermi.w $vr10, $vr10, 0xd8 //0 2 4 6
|
||||
|
||||
vpackod.w $vr11, $vr9, $vr8 //1 5 3 7
|
||||
vpermi.w $vr11, $vr11, 0xd8 //1 3 5 7
|
||||
|
||||
vfmul.s $vr10, $vr30, $vr28
|
||||
vfmul.s $vr11, $vr31, $vr28
|
||||
VNMSUB $vr10, $vr31, $vr29, $vr10
|
||||
VFMADD $vr11, $vr30, $vr29, $vr11
|
||||
|
||||
vilvl.w $vr8, $vr11, $vr10 //0 1 2 3
|
||||
|
||||
vilvh.w $vr9, $vr11, $vr10 //4 5 6 7
|
||||
|
||||
vst $vr8, C0, 0x00
|
||||
vst $vr9, C1, 0x00
|
||||
#else
|
||||
vld $vr8, C0, 0x00 //0 1 2 3
|
||||
vld $vr9, C1, 0x00 //4 5 6 7
|
||||
|
||||
vpackev.w $vr10, $vr9, $vr8 //0 4 2 6
|
||||
vpermi.w $vr10, $vr10, 0xd8 //0 2 4 6
|
||||
|
||||
vpackod.w $vr11, $vr9, $vr8 //1 5 3 7
|
||||
vpermi.w $vr11, $vr11, 0xd8 //1 3 5 7
|
||||
|
||||
VFMADD $vr10, $vr30, $vr28, $vr10
|
||||
VFMADD $vr11, $vr31, $vr28, $vr11
|
||||
VNMSUB $vr10, $vr31, $vr29, $vr10
|
||||
VFMADD $vr11, $vr30, $vr29, $vr11
|
||||
|
||||
vilvl.w $vr8, $vr11, $vr10 //0 1 2 3
|
||||
|
||||
vilvh.w $vr9, $vr11, $vr10 //4 5 6 7
|
||||
|
||||
vst $vr8, C0, 0x00
|
||||
vst $vr9, C1, 0x00
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
addi.d C1, C1, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L11
|
||||
|
||||
.L150:
|
||||
move I, $r0
|
||||
andi T0, M, 1
|
||||
beq I, T0, .L18
|
||||
|
||||
.L15: /* for(i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L17
|
||||
blt TL, L, .L17
|
||||
|
||||
.L16: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD b3, B0, 0x08 //load4
|
||||
MADD1 c21, a1, b3, c21 //res2
|
||||
MADD2 c22, a2, b3, c22 //res3
|
||||
LD b4, B0, 0x0c //load5
|
||||
MADD3 c21, a2, b4, c21
|
||||
MADD4 c22, a1, b4, c22
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x10
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L16
|
||||
|
||||
.L17:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL b5, c21, ALPHA_R
|
||||
MUL b6, c22, ALPHA_I
|
||||
SUB b5, b5, b6
|
||||
ST b5, C1, 0x00
|
||||
|
||||
MUL b5, c22, ALPHA_R
|
||||
MUL b6, c21, ALPHA_I
|
||||
ADD b6, b5, b6
|
||||
ST b6, C1, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD b5, C1, 0x00 //C1[0]
|
||||
LD b6, C1, 0x04 //C1[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MADD b5, c21, ALPHA_R, b5
|
||||
MADD b6, c22, ALPHA_R, b6
|
||||
NMSUB b5, c22, ALPHA_I, b5
|
||||
MADD b6, c21, ALPHA_I, b6
|
||||
ST b5, C1, 0x00
|
||||
ST b6, C1, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x04
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
addi.d C1, C1, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L15
|
||||
|
||||
.L18:
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
slli.d L, K, 0x04
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 0x02
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
srai.d T0, N, 1
|
||||
blt J, T0, .L10
|
||||
|
||||
.L19:
|
||||
move J, $r0
|
||||
andi T0, N, 1
|
||||
beq J, T0, .L30
|
||||
|
||||
.L20: /* for (j=0; j<(bn&1); j+=1) */
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move C0, C
|
||||
move A0, A //ptrba
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L24
|
||||
|
||||
.L21: /* for (i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L23
|
||||
blt TL, L, .L23
|
||||
|
||||
.L22: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD a3, A0, 0x08 //load4
|
||||
MADD1 c21, a3, b1, c21 //res2
|
||||
LD a4, A0, 0x0c //load5
|
||||
MADD2 c22, a4, b1, c22 //res3
|
||||
MADD3 c21, a4, b2, c21
|
||||
MADD4 c22, a3, b2, c22
|
||||
|
||||
addi.d A0, A0, 0x10
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L22
|
||||
|
||||
.L23:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL a7, c21, ALPHA_R
|
||||
MUL a8, c22, ALPHA_I
|
||||
SUB a7, a7, a8
|
||||
ST a7, C0, 0x08
|
||||
|
||||
MUL a7, c22, ALPHA_R
|
||||
MUL a8, c21, ALPHA_I
|
||||
ADD a8, a7, a8
|
||||
ST a8, C0, 0x0c
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD a7, C0, 0x08 //C1[2]
|
||||
LD a8, C0, 0x0c //C1[3]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
MADD a7, c21, ALPHA_R, a7
|
||||
MADD a8, c22, ALPHA_R, a8
|
||||
NMSUB a7, c22, ALPHA_I, a7
|
||||
MADD a8, c21, ALPHA_I, a8
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
ST a7, C0, 0x08
|
||||
ST a8, C0, 0x0c
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x03
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L21
|
||||
|
||||
.L24:
|
||||
move I, $r0
|
||||
andi T1, M, 1 //bm&1
|
||||
beq I, T1, .L28
|
||||
|
||||
.L25: /* for (i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L27
|
||||
blt TL, L, .L27
|
||||
|
||||
.L26: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L26
|
||||
|
||||
.L27:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T1, .L25
|
||||
|
||||
.L28:
|
||||
slli.d L, K, 3
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 1
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
andi T0, N, 1
|
||||
blt J, T0, .L20
|
||||
|
||||
.L30:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
LDARG $r25, $sp, 16
|
||||
LDARG $r26, $sp, 24
|
||||
LDARG $r27, $sp, 32
|
||||
LD $f23, $sp, 40
|
||||
LD $f24, $sp, 48
|
||||
LD $f25, $sp, 56
|
||||
LD $f26, $sp, 64
|
||||
LD $f27, $sp, 72
|
||||
LD $f28, $sp, 80
|
||||
LD $f29, $sp, 88
|
||||
LD $f30, $sp, 96
|
||||
LD $f31, $sp, 104
|
||||
|
||||
addi.d $sp, $sp, 128
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE

@@ -0,0 +1,812 @@
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: bm
|
||||
#define N $r5 // param 2: bn
|
||||
#define K $r6 // param 3: bk
|
||||
#define ALPHA_R $f0 // param 4: alphar
|
||||
#define ALPHA_I $f1 // param 5: alphai
|
||||
#define A $r7 // param 6: ba
|
||||
#define B $r8 // param 7: bb
|
||||
#define C $r9 // param 8: bc
|
||||
#define LDC $r10 // param 9: ldc
|
||||
|
||||
#if defined (TRMMKERNEL)
|
||||
#define OFFSET $r11 // param 10: offset
|
||||
#endif
|
||||
#define OFF $r26
|
||||
|
||||
#define I $r12
|
||||
#define J $r13
|
||||
#define L $r14
|
||||
#define TL $r15
|
||||
#define A0 $r16
|
||||
#define B0 $r17
|
||||
#define C0 $r18
|
||||
#define C1 $r19
|
||||
#define C2 $r20
|
||||
#define C3 $r23
|
||||
#define T0 $r24
|
||||
#define T1 $r25
|
||||
|
||||
#define a1 $f2
|
||||
#define a2 $f3
|
||||
#define a3 $f4
|
||||
#define a4 $f5
|
||||
#define a5 $f6
|
||||
#define a6 $f7
|
||||
#define a7 $f8
|
||||
#define a8 $f9
|
||||
#define b1 $f10
|
||||
#define b2 $f11
|
||||
#define b3 $f12
|
||||
#define b4 $f13
|
||||
#define b5 $f14
|
||||
#define b6 $f15
|
||||
#define b7 $f16
|
||||
#define b8 $f17
|
||||
#define c11 $f18
|
||||
#define c12 $f19
|
||||
#define c21 $f20
|
||||
#define c22 $f21
|
||||
#define c31 $f22
|
||||
#define c32 $f23
|
||||
#define c41 $f24
|
||||
#define c42 $f25
|
||||
|
||||
/* LSX vectors */
|
||||
#define U0 $vr30
|
||||
#define U1 $vr31
|
||||
#define U2 $vr2
|
||||
#define U3 $vr3
|
||||
#define U4 $vr4
|
||||
#define U5 $vr5
|
||||
#define U6 $vr6
|
||||
#define U7 $vr7
|
||||
#define U8 $vr8
|
||||
#define U9 $vr9
|
||||
#define U10 $vr10
|
||||
#define U11 $vr11
|
||||
#define U12 $vr12
|
||||
#define U13 $vr13
|
||||
#define U14 $vr14
|
||||
#define U15 $vr15
|
||||
#define D0 $vr16
|
||||
#define D1 $vr17
|
||||
#define D2 $vr18
|
||||
#define D3 $vr19
|
||||
#define D4 $vr20
|
||||
#define D5 $vr21
|
||||
#define D6 $vr22
|
||||
#define D7 $vr23
|
||||
#define D8 $vr24
|
||||
#define D9 $vr25
|
||||
#define D10 $vr26
|
||||
#define D11 $vr27
|
||||
#define VALPHAR $vr28
|
||||
#define VALPHAI $vr29
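/* The VMADD1..4 / MADD1..4 selections below follow the same complex
 * multiply-accumulate sign conventions as in the LASX kernel above
 * (descriptive comment, not in the original source). */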
|
||||
|
||||
|
||||
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VFMADD
|
||||
#define VMADD3 VNMSUB
|
||||
#define VMADD4 VFMADD
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 MADD
|
||||
#define MADD3 NMSUB
|
||||
#define MADD4 MADD
|
||||
#endif
|
||||
|
||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VFMADD
|
||||
#define VMADD3 VFMADD
|
||||
#define VMADD4 VNMSUB
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 MADD
|
||||
#define MADD3 MADD
|
||||
#define MADD4 NMSUB
|
||||
#endif
|
||||
|
||||
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VNMSUB
|
||||
#define VMADD3 VFMADD
|
||||
#define VMADD4 VFMADD
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 NMSUB
|
||||
#define MADD3 MADD
|
||||
#define MADD4 MADD
|
||||
#endif
|
||||
|
||||
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
||||
#define VMADD1 VFMADD
|
||||
#define VMADD2 VNMSUB
|
||||
#define VMADD3 VNMSUB
|
||||
#define VMADD4 VNMSUB
|
||||
|
||||
#define MADD1 MADD
|
||||
#define MADD2 NMSUB
|
||||
#define MADD3 NMSUB
|
||||
#define MADD4 NMSUB
|
||||
#endif
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -128
|
||||
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
SDARG $r25, $sp, 16
|
||||
SDARG $r26, $sp, 24
|
||||
SDARG $r27, $sp, 32
|
||||
ST $f23, $sp, 40
|
||||
ST $f24, $sp, 48
|
||||
ST $f25, $sp, 56
|
||||
ST $f26, $sp, 64
|
||||
ST $f27, $sp, 72
|
||||
ST $f28, $sp, 80
|
||||
ST $f29, $sp, 88
|
||||
ST $f30, $sp, 96
|
||||
ST $f31, $sp, 104
|
||||
ST ALPHA_R,$sp, 112
|
||||
ST ALPHA_I,$sp, 120
|
||||
|
||||
vldrepl.w VALPHAR, $sp, 112
|
||||
vldrepl.w VALPHAI, $sp, 120
|
||||
|
||||
#if defined (TRMMKERNEL) && !defined(LEFT)
|
||||
sub.d OFF, $r0, OFFSET
|
||||
#else
|
||||
xor OFF, OFF, OFF
|
||||
#endif
|
||||
|
||||
slli.d LDC, LDC, 2
|
||||
|
||||
move J, $r0
|
||||
srai.d T0, N, 1
|
||||
beq J, T0, .L19
|
||||
|
||||
.L10: /* for(j=0; j<bn/2; j+=1) */
|
||||
move C0, C
|
||||
slli.d TL, LDC, 1
|
||||
add.d C1, C0, TL
|
||||
move A0, A //ptrba
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L150
|
||||
|
||||
.L11: /* for(i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF //temp
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
vxor.v U0, U0, U0
|
||||
vxor.v U1, U1, U1
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
srai.d C2, TL, 2
|
||||
beq L, C2, .L130
|
||||
blt C2, L, .L130
|
||||
|
||||
.L12: /* for(k=0; k<bk/4; k+=1) */
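// Unrolled k-loop (descriptive note, not in the original source): four
// k steps per iteration, each working on one 128-bit vector (two complex
// elements). vshuf4i.w with 0x88/0xdd extracts the real/imag parts of A
// and 0xa0/0xf5 broadcasts the real/imag parts of B before the VMADD1..4
// complex multiply-accumulate.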
|
||||
vld D0, A0, 0x00 //a0 a1 a2 a3
|
||||
vld D1, B0, 0x00 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
vld D0, A0, 0x10 //a0 a1 a2 a3
|
||||
vld D1, B0, 0x10 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
vld D0, A0, 0x20 //a0 a1 a2 a3
|
||||
vld D1, B0, 0x20 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
vld D0, A0, 0x30 //a0 a1 a2 a3
|
||||
vld D1, B0, 0x30 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
addi.d A0, A0, 0x40
|
||||
addi.d B0, B0, 0x40
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L12
|
||||
|
||||
.L130:
|
||||
move L, $r0
|
||||
andi C2, TL, 3
|
||||
beq L, C2, .L14
|
||||
|
||||
.L13: /* for(k=0; k<(bk&3); k+=1) */
|
||||
vld D0, A0, 0x00 //a0 a1 a2 a3
|
||||
vld D1, B0, 0x00 //b0 b1 b2 b3
|
||||
|
||||
vshuf4i.w D4, D1, 0xa0 //b0 b0 b2 b2
|
||||
vshuf4i.w D5, D1, 0xf5 //b1 b1 b3 b3
|
||||
|
||||
vshuf4i.w D2, D0, 0x88 //a0 a2 a0 a2
|
||||
vshuf4i.w D3, D0, 0xdd //a1 a3 a1 a3
|
||||
|
||||
VMADD1 U0, D2, D4, U0 //res0 2 4 6
|
||||
VMADD2 U1, D3, D4, U1 //res1 3 5 7
|
||||
VMADD3 U0, D3, D5, U0
|
||||
VMADD4 U1, D2, D5, U1
|
||||
|
||||
addi.d A0, A0, 0x10
|
||||
addi.d B0, B0, 0x10
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, C2, .L13
|
||||
|
||||
.L14:
|
||||
#if defined(TRMMKERNEL)
|
||||
vld U8, C0, 0x00 //0 1 2 3
|
||||
vld U9, C1, 0x00 //4 5 6 7
|
||||
|
||||
vpackev.w U10, U9, U8 //0 4 2 6
|
||||
vpermi.w U10, U10, 0xd8 //0 2 4 6
|
||||
|
||||
vpackod.w U11, U9, U8 //1 5 3 7
|
||||
vpermi.w U11, U11, 0xd8 //1 3 5 7
|
||||
|
||||
vfmul.s U10, U0, VALPHAR
|
||||
vfmul.s U11, U1, VALPHAR
|
||||
VNMSUB U10, U1, VALPHAI, U10
|
||||
VFMADD U11, U0, VALPHAI, U11
|
||||
|
||||
vilvl.w U8, U11, U10 //0 1 2 3
|
||||
|
||||
vilvh.w U9, U11, U10 //4 5 6 7
|
||||
|
||||
vst U8, C0, 0x00
|
||||
vst U9, C1, 0x00
|
||||
#else
|
||||
vld U8, C0, 0x00 //0 1 2 3
|
||||
vld U9, C1, 0x00 //4 5 6 7
|
||||
|
||||
vpackev.w U10, U9, U8 //0 4 2 6
|
||||
vpermi.w U10, U10, 0xd8 //0 2 4 6
|
||||
|
||||
vpackod.w U11, U9, U8 //1 5 3 7
|
||||
vpermi.w U11, U11, 0xd8 //1 3 5 7
|
||||
|
||||
VFMADD U10, U0, VALPHAR, U10
|
||||
VFMADD U11, U1, VALPHAR, U11
|
||||
VNMSUB U10, U1, VALPHAI, U10
|
||||
VFMADD U11, U0, VALPHAI, U11
|
||||
|
||||
vilvl.w U8, U11, U10 //0 1 2 3
|
||||
|
||||
vilvh.w U9, U11, U10 //4 5 6 7
|
||||
|
||||
vst U8, C0, 0x00
|
||||
vst U9, C1, 0x00
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
addi.d C1, C1, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L11
|
||||
|
||||
.L150:
|
||||
move I, $r0
|
||||
andi T0, M, 1
|
||||
beq I, T0, .L18
|
||||
|
||||
.L15: /* for(i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 2
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L17
|
||||
blt TL, L, .L17
|
||||
|
||||
.L16: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD b3, B0, 0x08 //load4
|
||||
MADD1 c21, a1, b3, c21 //res2
|
||||
MADD2 c22, a2, b3, c22 //res3
|
||||
LD b4, B0, 0x0c //load5
|
||||
MADD3 c21, a2, b4, c21
|
||||
MADD4 c22, a1, b4, c22
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x10
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L16
|
||||
|
||||
.L17:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL b5, c21, ALPHA_R
|
||||
MUL b6, c22, ALPHA_I
|
||||
SUB b5, b5, b6
|
||||
ST b5, C1, 0x00
|
||||
|
||||
MUL b5, c22, ALPHA_R
|
||||
MUL b6, c21, ALPHA_I
|
||||
ADD b6, b5, b6
|
||||
ST b6, C1, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD b5, C1, 0x00 //C1[0]
|
||||
LD b6, C1, 0x04 //C1[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MADD b5, c21, ALPHA_R, b5
|
||||
MADD b6, c22, ALPHA_R, b6
|
||||
NMSUB b5, c22, ALPHA_I, b5
|
||||
MADD b6, c21, ALPHA_I, b6
|
||||
ST b5, C1, 0x00
|
||||
ST b6, C1, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -2
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x04
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
addi.d C1, C1, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L15
|
||||
|
||||
.L18:
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
|
||||
slli.d L, K, 0x04
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 0x02
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
srai.d T0, N, 1
|
||||
blt J, T0, .L10
|
||||
|
||||
.L19:
|
||||
move J, $r0
|
||||
andi T0, N, 1
|
||||
beq J, T0, .L30
|
||||
|
||||
.L20: /* for (j=0; j<(bn&1); j+=1) */
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
move OFF, OFFSET
|
||||
#endif
|
||||
|
||||
move C0, C
|
||||
move A0, A //ptrba
|
||||
|
||||
move I, $r0
|
||||
srai.d T0, M, 1
|
||||
beq I, T0, .L24
|
||||
|
||||
.L21: /* for (i=0; i<bm/2; i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 2
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
MTC c21, $r0
|
||||
MTC c22, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L23
|
||||
blt TL, L, .L23
|
||||
|
||||
.L22: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
LD a3, A0, 0x08 //load4
|
||||
MADD1 c21, a3, b1, c21 //res2
|
||||
LD a4, A0, 0x0c //load5
|
||||
MADD2 c22, a4, b1, c22 //res3
|
||||
MADD3 c21, a4, b2, c21
|
||||
MADD4 c22, a3, b2, c22
|
||||
|
||||
addi.d A0, A0, 0x10
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L22
|
||||
|
||||
.L23:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
|
||||
MUL a7, c21, ALPHA_R
|
||||
MUL a8, c22, ALPHA_I
|
||||
SUB a7, a7, a8
|
||||
ST a7, C0, 0x08
|
||||
|
||||
MUL a7, c22, ALPHA_R
|
||||
MUL a8, c21, ALPHA_I
|
||||
ADD a8, a7, a8
|
||||
ST a8, C0, 0x0c
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
LD a7, C0, 0x08 //C1[2]
|
||||
LD a8, C0, 0x0c //C1[3]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
MADD a7, c21, ALPHA_R, a7
|
||||
MADD a8, c22, ALPHA_R, a8
|
||||
NMSUB a7, c22, ALPHA_I, a7
|
||||
MADD a8, c21, ALPHA_I, a8
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
ST a7, C0, 0x08
|
||||
ST a8, C0, 0x0c
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -2
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x04
|
||||
add.d A0, A0, C3
|
||||
slli.d C3, TL, 0x03
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 2
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x10
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T0, .L21
|
||||
|
||||
.L24:
|
||||
move I, $r0
|
||||
andi T1, M, 1 //bm&1
|
||||
beq I, T1, .L28
|
||||
|
||||
.L25: /* for (i=0; i<(bm&1); i+=1) */
|
||||
move B0, B //ptrbb
|
||||
move TL, K /* TL = bk */
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
move B0, B //ptrbb
|
||||
#else
|
||||
slli.d C3, OFF, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B, C3
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#elif defined(LEFT)
|
||||
addi.d TL, OFF, 1
|
||||
#else
|
||||
addi.d TL, OFF, 1
|
||||
#endif
|
||||
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
MTC c11, $r0
|
||||
MTC c12, $r0
|
||||
|
||||
move L, $r0 //cycle param k
|
||||
beq L, TL, .L27
|
||||
blt TL, L, .L27
|
||||
|
||||
.L26: /* for (k=0; k<bk; k+=1) */
|
||||
LD a1, A0, 0x00 //load0
|
||||
LD b1, B0, 0x00 //load1
|
||||
MADD1 c11, a1, b1, c11 //res0
|
||||
LD a2, A0, 0x04 //load2
|
||||
MADD2 c12, a2, b1, c12 //res1
|
||||
LD b2, B0, 0x04 //load3
|
||||
MADD3 c11, a2, b2, c11
|
||||
MADD4 c12, a1, b2, c12
|
||||
|
||||
addi.d A0, A0, 0x08
|
||||
addi.d B0, B0, 0x08
|
||||
|
||||
addi.d L, L, 1
|
||||
blt L, TL, .L26
|
||||
|
||||
.L27:
|
||||
#if defined(TRMMKERNEL)
|
||||
MUL a5, c11, ALPHA_R
|
||||
MUL a6, c12, ALPHA_I
|
||||
SUB a5, a5, a6
|
||||
ST a5, C0, 0x00
|
||||
|
||||
MUL a5, c12, ALPHA_R
|
||||
MUL a6, c11, ALPHA_I
|
||||
ADD a6, a5, a6
|
||||
ST a6, C0, 0x04
|
||||
#else
|
||||
LD a5, C0, 0x00 //C0[0]
|
||||
LD a6, C0, 0x04 //C0[1]
|
||||
|
||||
MADD a5, c11, ALPHA_R, a5
|
||||
MADD a6, c12, ALPHA_R, a6
|
||||
NMSUB a5, c12, ALPHA_I, a5
|
||||
MADD a6, c11, ALPHA_I, a6
|
||||
|
||||
ST a5, C0, 0x00
|
||||
ST a6, C0, 0x04
|
||||
#endif
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
sub.d TL, K, OFF
|
||||
#ifdef LEFT
|
||||
addi.d TL, TL, -1
|
||||
#else
|
||||
addi.d TL, TL, -1
|
||||
#endif
|
||||
slli.d C3, TL, 0x03
|
||||
add.d A0, A0, C3
|
||||
add.d B0, B0, C3
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
addi.d OFF, OFF, 1
|
||||
#endif
|
||||
#endif // #if defined(TRMMKERNEL)
|
||||
|
||||
addi.d C0, C0, 0x08
|
||||
|
||||
addi.d I, I, 1
|
||||
blt I, T1, .L25
|
||||
|
||||
.L28:
|
||||
slli.d L, K, 3
|
||||
add.d B, B, L
|
||||
|
||||
slli.d I, LDC, 1
|
||||
add.d C, C, I
|
||||
|
||||
addi.d J, J, 1
|
||||
andi T0, N, 1
|
||||
blt J, T0, .L20
|
||||
|
||||
.L30:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
LDARG $r25, $sp, 16
|
||||
LDARG $r26, $sp, 24
|
||||
LDARG $r27, $sp, 32
|
||||
LD $f23, $sp, 40
|
||||
LD $f24, $sp, 48
|
||||
LD $f25, $sp, 56
|
||||
LD $f26, $sp, 64
|
||||
LD $f27, $sp, 72
|
||||
LD $f28, $sp, 80
|
||||
LD $f29, $sp, 88
|
||||
LD $f30, $sp, 96
|
||||
LD $f31, $sp, 104
|
||||
|
||||
addi.d $sp, $sp, 128
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE

@@ -0,0 +1,193 @@
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define TD $r20
|
||||
#define TS $r11
|
||||
#define TL $r7
|
||||
#define T0 $r23
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define D0 $xr8
|
||||
#define D1 $xr9
|
||||
#define D2 $xr10
|
||||
#define D3 $xr11
|
||||
#define D4 $xr12
|
||||
#define D5 $xr13
|
||||
#define D6 $xr14
|
||||
#define D7 $xr15
|
||||
#define D8 $xr16
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -8
|
||||
SDARG $r23, $sp, 0
|
||||
|
||||
move TD, DST //boffset
|
||||
move TS, SRC //aoffset
|
||||
|
||||
slli.d TL, LDA, 0x02 //lda
|
||||
slli.d TL, TL, 0x01
|
||||
slli.d T0, TL, 0x01
|
||||
|
||||
srai.d I, N, 0x01
|
||||
beq I, ZERO, .L_N0
|
||||
|
||||
.L_J1: /* if (i > 0) I-- */
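// Column-pair loop (descriptive note, not in the original source): packs
// two source columns at a time. Each iteration loads four complex elements
// from each column and interleaves them into contiguous 2x2 complex blocks
// in the destination buffer.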
|
||||
move S1, TS //a_offset1
|
||||
add.d S2, TS, TL //a_offset2
|
||||
srai.d J, M, 0x02
|
||||
add.d TS, TS, T0
|
||||
|
||||
beq J, ZERO, .L_I3
|
||||
|
||||
.L_I1: /* if (j > 0) J-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x00
|
||||
xvld U2, S2, 0x00
|
||||
|
||||
xvpermi.q U0, U2, 0x02
|
||||
xvpermi.q U2, U1, 0x31
|
||||
|
||||
xvpermi.d U0, U0, 0xd8
|
||||
xvpermi.d U2, U2, 0xd8
|
||||
|
||||
xvst U0, TD, 0x00
|
||||
xvst U2, TD, 0x20
|
||||
|
||||
addi.d S1, S1, 0x20 // a_offset1
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d TD, TD, 0x40 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_I1
|
||||
|
||||
.L_I3:
|
||||
andi J, M, 0x03
|
||||
beq J, ZERO, .L_II20
|
||||
|
||||
.L_II1: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
fld.s F2, S2, 0x00
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0c
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d TD, TD, 0x10
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_II1
|
||||
|
||||
.L_II20:
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_J1
|
||||
|
||||
.L_N0: /* if(n&1)*/
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N00
|
||||
|
||||
.L_N1:
|
||||
srai.d J, M, 0x02
|
||||
beq ZERO, J, .L_N10
|
||||
|
||||
.L_N11: /* j = (m >> 2) if (j > 0) */
|
||||
xvld U0, TS, 0x00
|
||||
|
||||
xvst U0, TD, 0x00
|
||||
|
||||
addi.d TS, TS, 0x20 // a_offset
|
||||
addi.d TD, TD, 0x20 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N11
|
||||
|
||||
.L_N10:
|
||||
andi J, M, 0x03
|
||||
beq J, ZERO, .L_N00
|
||||
|
||||
.L_N12: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, TS, 0x00
|
||||
fld.s F1, TS, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
|
||||
addi.d TS, TS, 0x08 // a_offset
|
||||
addi.d TD, TD, 0x08 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N12
|
||||
|
||||
.L_N00:
|
||||
LDARG $r23, $sp, 0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE

@@ -0,0 +1,202 @@
/*******************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define TD $r20
|
||||
#define TS $r11
|
||||
#define TL $r7
|
||||
#define T0 $r23
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
|
||||
/* LSX vectors */
|
||||
#define U0 $vr0
|
||||
#define U1 $vr1
|
||||
#define U2 $vr2
|
||||
#define U3 $vr3
|
||||
#define U4 $vr4
|
||||
#define U5 $vr5
|
||||
#define U6 $vr6
|
||||
#define U7 $vr7
|
||||
#define D0 $vr8
|
||||
#define D1 $vr9
|
||||
#define D2 $vr10
|
||||
#define D3 $vr11
|
||||
#define D4 $vr12
|
||||
#define D5 $vr13
|
||||
#define D6 $vr14
|
||||
#define D7 $vr15
|
||||
#define D8 $vr16
|
||||
|
||||
PROLOGUE
|
||||
|
||||
addi.d $sp, $sp, -8
|
||||
SDARG $r23, $sp, 0
|
||||
|
||||
move TD, DST //boffset
|
||||
move TS, SRC //aoffset
|
||||
|
||||
slli.d TL, LDA, 0x02 //lda
|
||||
slli.d TL, TL, 0x01
|
||||
slli.d T0, TL, 0x01
|
||||
|
||||
srai.d I, N, 0x01
|
||||
beq I, ZERO, .L_N0
|
||||
|
||||
.L_J1: /* if (i > 0) I-- */
|
||||
move S1, TS //a_offset1
|
||||
add.d S2, TS, TL //a_offset2
|
||||
srai.d J, M, 0x02
|
||||
add.d TS, TS, T0
|
||||
|
||||
beq J, ZERO, .L_I3
|
||||
|
||||
.L_I1: /* if (j > 0) J-- */
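// Row-block loop (descriptive note, not in the original source): the
// vand.v copies preserve the second column's data while vpermi.w
// interleaves complex pairs from the two columns into the destination,
// two complex elements per 128-bit store.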
|
||||
vld U0, S1, 0x00
|
||||
vld U1, S1, 0x10
|
||||
vld U2, S2, 0x00
|
||||
vld U3, S2, 0x10
|
||||
|
||||
vand.v D0, U2, U2
|
||||
vand.v D1, U3, U3
|
||||
vand.v D2, U2, U2
|
||||
vand.v D3, U3, U3
|
||||
|
||||
vpermi.w D0, U0, 0x44
|
||||
vpermi.w D2, U0, 0xee
|
||||
vpermi.w D1, U1, 0x44
|
||||
vpermi.w D3, U1, 0xee
|
||||
|
||||
vst D0, TD, 0x00
|
||||
vst D2, TD, 0x10
|
||||
vst D1, TD, 0x20
|
||||
vst D3, TD, 0x30
|
||||
|
||||
addi.d S1, S1, 0x20 // a_offset1
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d TD, TD, 0x40 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_I1
|
||||
|
||||
.L_I3:
|
||||
andi J, M, 0x03
|
||||
beq J, ZERO, .L_II20
|
||||
|
||||
.L_II1: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
fld.s F2, S2, 0x00
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0c
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d TD, TD, 0x10
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_II1
|
||||
|
||||
.L_II20:
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_J1
|
||||
|
||||
.L_N0: /* if(n&1)*/
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N00
|
||||
|
||||
.L_N1:
|
||||
srai.d J, M, 0x02
|
||||
beq ZERO, J, .L_N10
|
||||
|
||||
.L_N11: /* j = (m >> 2) if (j > 0) */
|
||||
vld U0, TS, 0x00
|
||||
vld U1, TS, 0x10
|
||||
|
||||
vst U0, TD, 0x00
|
||||
vst U1, TD, 0x10
|
||||
|
||||
addi.d TS, TS, 0x20 // a_offset
|
||||
addi.d TD, TD, 0x20 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N11
|
||||
|
||||
.L_N10:
|
||||
andi J, M, 0x03
|
||||
beq J, ZERO, .L_N00
|
||||
|
||||
.L_N12: /* j = (m & 3) if (j > 0) */
|
||||
fld.s F0, TS, 0x00
|
||||
fld.s F1, TS, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
|
||||
addi.d TS, TS, 0x08 // a_offset
|
||||
addi.d TD, TD, 0x08 // b_offset
|
||||
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_N12
|
||||
|
||||
.L_N00:
|
||||
LDARG $r23, $sp, 0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
||||
|
|
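For reference, a scalar C sketch of what the pack above appears to compute: two columns of single-precision complex values, lda apart, are interleaved element by element into the contiguous destination buffer, with a lone trailing column copied as-is. The function and parameter names are illustrative only, not part of OpenBLAS.

#include <complex.h>
#include <stddef.h>

/* Hedged sketch, not the shipped implementation. */
static void pack_2cols(int m, int n, const float complex *a, size_t lda,
                       float complex *b)
{
    int j;
    for (j = 0; j + 1 < n; j += 2) {                 /* pairs: srai.d I, N, 0x01 */
        const float complex *a1 = a + (size_t)j * lda;   /* S1 */
        const float complex *a2 = a1 + lda;              /* S2 */
        for (int i = 0; i < m; i++) {                /* vector body: 4 per pass */
            *b++ = a1[i];                            /* low half of D0..D3 */
            *b++ = a2[i];                            /* high half of D0..D3 */
        }
    }
    if (n & 1) {                                     /* .L_N0: lone final column */
        const float complex *a1 = a + (size_t)j * lda;
        for (int i = 0; i < m; i++)
            *b++ = a1[i];
    }
}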
@@ -0,0 +1,218 @@
/*******************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"

/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst

#define I $r9
#define J $r10
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define TD $r16
#define TS $r17
#define TL $r7
#define T0 $r18
#define S8 $r19
#define S9 $r20
#define S10 $r23
#define ZERO $r0

#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7

/* LASX vectors */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define D0 $xr8
#define D1 $xr9
#define D2 $xr10
#define D3 $xr11
#define D4 $xr12
#define D5 $xr13
#define D6 $xr14
#define D7 $xr15


    PROLOGUE

    addi.d $sp, $sp, -8
    SDARG $r23, $sp, 0

    move TS, SRC //aoffset
    move TD, DST //boffset
    slli.d TL, LDA, 0x02 //lda
    slli.d TL, TL, 0x01

    ori T0, ZERO, 0x01
    andn T0, N, T0                  // T0 = M * (N & ~1) complex elements,
    mul.d T0, M, T0                 // scaled to bytes below: start of the
    slli.d T0, T0, 0x01             // odd-column tail region
    slli.d T0, T0, 0x02
    add.d S9, DST, T0 //boffset2

    srai.d J, M, 0x01 //j

    beq J, ZERO, .L_M1

.L_J1: /* if(j>0) j--*/
    move S1, TS //aoffset1
    slli.d T0, TL, 0x01
    add.d S2, S1, TL //aoffset2
    add.d TS, TS, T0

    move S8, TD //boffset1
    addi.d TD, TD, 0x20

    srai.d I, N, 0x02
    beq ZERO, I, .L_JN1

.L_JI1: /* if(i>0) i--*/
    xvld U0, S1, 0x00
    xvld U1, S1, 0x00    // second copy of row 1: xvpermi.q overwrites its destination
    xvld U2, S2, 0x00

    xvpermi.q U0, U2, 0x02
    xvpermi.q U2, U1, 0x31

    xvst U0, S8, 0x00

    slli.d T0, M, 0x04
    add.d S8, S8, T0

    xvst U2, S8, 0x00

    add.d S8, S8, T0
    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20

    addi.d I, I, -1
    blt ZERO, I, .L_JI1

.L_JN1: /* if(n&2) */
    andi I, N, 0x02
    beq ZERO, I, .L_JN2

    vld $vr0, S1, 0x00
    vld $vr1, S2, 0x00

    vst $vr0, S8, 0x00
    vst $vr1, S8, 0x10

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10

.L_JN2: /* if(n&1) */
    andi I, N, 0x01
    beq ZERO, I, .L_J0

    fld.s F0, S1, 0x00
    fld.s F1, S1, 0x04
    fld.s F2, S2, 0x00
    fld.s F3, S2, 0x04

    fst.s F0, S9, 0x00
    fst.s F1, S9, 0x04
    fst.s F2, S9, 0x08
    fst.s F3, S9, 0x0c

    addi.d S9, S9, 0x10

.L_J0:
    addi.d J, J, -1
    blt ZERO, J, .L_J1

.L_M1: /* if(m&1) */
    andi I, M, 0x01
    beq ZERO, I, .L_M0

    srai.d I, N, 0x02
    beq ZERO, I, .L_M1N1

.L_M1I1: /* if(i>0) */
    vld $vr0, TS, 0x00
    vld $vr1, TS, 0x10

    vst $vr0, TD, 0x00

    slli.d T0, M, 0x04
    add.d TD, TD, T0

    vst $vr1, TD, 0x00

    add.d TD, TD, T0
    addi.d TS, TS, 0x20

    addi.d I, I, -1
    blt ZERO, I, .L_M1I1

.L_M1N1: /* if(n&2) */
    andi I, N, 0x02
    beq ZERO, I, .L_M1N2

    vld $vr0, TS, 0x00

    vst $vr0, TD, 0x00

    addi.d TS, TS, 0x10

.L_M1N2: /* if(n&1) */
    andi I, N, 0x01
    beq ZERO, I, .L_M0

    fld.s F0, TS, 0x00
    fld.s F1, TS, 0x04

    fst.s F0, S9, 0x00
    fst.s F1, S9, 0x04

.L_M0:
    LDARG $r23, $sp, 0
    addi.d $sp, $sp, 8
    jirl $r0, $r1, 0x00

    EPILOGUE
@@ -0,0 +1,218 @@
/*******************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"

/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst

#define I $r9
#define J $r10
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define TD $r16
#define TS $r17
#define TL $r7
#define T0 $r18
#define S8 $r19
#define S9 $r20
#define S10 $r23
#define ZERO $r0

#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7

/* LSX vectors */
#define U0 $vr0
#define U1 $vr1
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7
#define D0 $vr8
#define D1 $vr9
#define D2 $vr10
#define D3 $vr11
#define D4 $vr12
#define D5 $vr13
#define D6 $vr14
#define D7 $vr15


    PROLOGUE

    addi.d $sp, $sp, -8
    SDARG $r23, $sp, 0

    move TS, SRC //aoffset
    move TD, DST //boffset
    slli.d TL, LDA, 0x02 //lda
    slli.d TL, TL, 0x01

    ori T0, ZERO, 0x01
    andn T0, N, T0                  // T0 = M * (N & ~1) complex elements,
    mul.d T0, M, T0                 // scaled to bytes: start of the
    slli.d T0, T0, 0x01             // odd-column tail region
    slli.d T0, T0, 0x02
    add.d S9, DST, T0 //boffset2

    srai.d J, M, 0x01 //j

    beq J, ZERO, .L_M1

.L_J1: /* if(j>0) j--*/
    move S1, TS //aoffset1
    slli.d T0, TL, 0x01
    add.d S2, S1, TL //aoffset2
    add.d TS, TS, T0

    move S8, TD //boffset1
    addi.d TD, TD, 0x20

    srai.d I, N, 0x02
    beq ZERO, I, .L_JN1

.L_JI1: /* if(i>0) i--*/
    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S2, 0x00
    vld U3, S2, 0x10

    vst U0, S8, 0x00
    vst U2, S8, 0x10

    slli.d T0, M, 0x04
    add.d S8, S8, T0

    vst U1, S8, 0x00
    vst U3, S8, 0x10

    add.d S8, S8, T0
    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20

    addi.d I, I, -1
    blt ZERO, I, .L_JI1

.L_JN1: /* if(n&2) */
    andi I, N, 0x02
    beq ZERO, I, .L_JN2

    vld U0, S1, 0x00
    vld U1, S2, 0x00

    vst U0, S8, 0x00
    vst U1, S8, 0x10

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10

.L_JN2: /* if(n&1) */
    andi I, N, 0x01
    beq ZERO, I, .L_J0

    fld.s F0, S1, 0x00
    fld.s F1, S1, 0x04
    fld.s F2, S2, 0x00
    fld.s F3, S2, 0x04

    fst.s F0, S9, 0x00
    fst.s F1, S9, 0x04
    fst.s F2, S9, 0x08
    fst.s F3, S9, 0x0c

    addi.d S9, S9, 0x10

.L_J0:
    addi.d J, J, -1
    blt ZERO, J, .L_J1

.L_M1: /* if(m&1) */
    andi I, M, 0x01
    beq ZERO, I, .L_M0

    srai.d I, N, 0x02
    beq ZERO, I, .L_M1N1

.L_M1I1: /* if(i>0) */
    vld U0, TS, 0x00
    vld U1, TS, 0x10

    vst U0, TD, 0x00

    slli.d T0, M, 0x04
    add.d TD, TD, T0

    vst U1, TD, 0x00

    add.d TD, TD, T0
    addi.d TS, TS, 0x20

    addi.d I, I, -1
    blt ZERO, I, .L_M1I1

.L_M1N1: /* if(n&2) */
    andi I, N, 0x02
    beq ZERO, I, .L_M1N2

    vld U0, TS, 0x00

    vst U0, TD, 0x00

    addi.d TS, TS, 0x10

.L_M1N2: /* if(n&1) */
    andi I, N, 0x01
    beq ZERO, I, .L_M0

    fld.s F0, TS, 0x00
    fld.s F1, TS, 0x04

    fst.s F0, S9, 0x00
    fst.s F1, S9, 0x04

.L_M0:
    LDARG $r23, $sp, 0
    addi.d $sp, $sp, 8
    jirl $r0, $r1, 0x00

    EPILOGUE
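The two kernels above (LASX and LSX variants of the same routine) appear to repack an m-by-n panel of single-precision complex values into 2x2 tiles, with odd-row and odd-column remainders appended at the end of the buffer. A hedged scalar sketch, assuming elements along S1/S2 are contiguous and the two pointers sit lda elements apart; all names are illustrative:

#include <complex.h>
#include <stddef.h>

/* Hedged sketch, not the shipped implementation. */
static void pack_2x2(int m, int n, const float complex *a, size_t lda,
                     float complex *b)
{
    float complex *b2 = b + (size_t)m * (size_t)(n & ~1);  /* boffset2 (S9) */
    int i;
    for (i = 0; i + 1 < m; i += 2) {                   /* row pairs (J loop) */
        const float complex *a1 = a + (size_t)i * lda; /* S1 */
        const float complex *a2 = a1 + lda;            /* S2 */
        float complex *bb = b + 2 * (size_t)i;         /* boffset1 (S8) */
        int j;
        for (j = 0; j + 1 < n; j += 2) {               /* one 2x2 tile */
            bb[0] = a1[j];
            bb[1] = a1[j + 1];
            bb[2] = a2[j];
            bb[3] = a2[j + 1];
            bb += 2 * (size_t)m;                       /* stride M*0x10 bytes */
        }
        if (n & 1) {                                   /* odd column -> b2 */
            *b2++ = a1[j];
            *b2++ = a2[j];
        }
    }
    if (m & 1) {                                       /* odd final row */
        const float complex *a1 = a + (size_t)i * lda;
        float complex *bb = b + 2 * (size_t)i;
        int j;
        for (j = 0; j + 1 < n; j += 2) {               /* 1x2 tiles */
            bb[0] = a1[j];
            bb[1] = a1[j + 1];
            bb += 2 * (size_t)m;
        }
        if (n & 1)
            *b2++ = a1[j];
    }
}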
@@ -0,0 +1,147 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15
#define a1 $f15
#define a2 $f16
#define res $f19
#define VX0 $xr15
#define VX1 $xr16
#define VX2 $xr17
#define VX3 $xr18
#define VX4 $xr21
#define res1 $xr19
#define res2 $xr20

    PROLOGUE

#ifdef F_INTERFACE
    LDINT N, 0(N)
    LDINT INCX, 0(INCX)
#endif

    xvxor.v res1, res1, res1
    xvxor.v res2, res2, res2
    bge $r0, N, .L999
    beq $r0, INCX, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT   // bytes per complex element, so the
    slli.d INCX, INCX, ZBASE_SHIFT   // contiguous fast path is reachable
    srai.d I, N, 2
    bne INCX, TEMP, .L20
    bge $r0, I, .L997
    .align 3

.L10:
    xvld VX0, X, 0 * SIZE
    xvfcvtl.d.s VX1, VX0             // widen to double and accumulate squares
    xvfcvth.d.s VX2, VX0
    xvfmadd.d res1, VX1, VX1, res1
    xvfmadd.d res2, VX2, VX2, res2
    addi.d I, I, -1
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L10
    .align 3
    b .L996

.L20:
    bge $r0, I, .L997
    .align 3

.L21:
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    add.d X, X, INCX
    xvfcvtl.d.s VX1, VX0
    xvfcvth.d.s VX2, VX0
    xvfmadd.d res1, VX1, VX1, res1
    xvfmadd.d res2, VX2, VX2, res2
    addi.d I, I, -1
    blt $r0, I, .L21
    b .L996

.L996:
    xvfadd.d res1, res1, res2
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
    .align 3

.L997:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3

.L998:
    fld.s a1, X, 0 * SIZE
    fld.s a2, X, 1 * SIZE
    addi.d I, I, -1
    fcvt.d.s a1, a1
    fcvt.d.s a2, a2
    fmadd.d res, a1, a1, res
    fmadd.d res, a2, a2, res
    add.d X, X, INCX
    blt $r0, I, .L998
    .align 3

.L999:
    fsqrt.d res, res
    move $r4, $r17
    fcvt.s.d $f0, res
    jirl $r0, $r1, 0x0

    EPILOGUE
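Both nrm2 kernels (the LASX one above and the LSX one that follows) widen each single-precision component to double, accumulate squares in two vector accumulators, reduce across lanes, and take the square root; there is no overflow/underflow scaling. A minimal scalar sketch with illustrative names:

#include <math.h>

/* Hedged sketch, not the shipped implementation; incx counts complex
 * elements and is assumed positive, mirroring the kernel's fast paths. */
static float scnrm2_ref(int n, const float *x, int incx)
{
    if (incx == 0)                      /* the kernel branches to .L999 */
        return 0.0f;
    double acc = 0.0;                   /* res1/res2, zeroed by xvxor.v */
    for (int i = 0; i < n; i++) {
        double re = x[2L * i * incx];       /* fcvt.d.s widening */
        double im = x[2L * i * incx + 1];
        acc += re * re + im * im;           /* the fmadd.d chain */
    }
    return (float)sqrt(acc);            /* fsqrt.d + fcvt.s.d at .L999 */
}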
@@ -0,0 +1,155 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15
#define a1 $f15
#define a2 $f16
#define res $f19
#define VX0 $vr15
#define VX1 $vr16
#define VX2 $vr17
#define VX3 $vr18
#define VX4 $vr21
#define res1 $vr19
#define res2 $vr20

    PROLOGUE

#ifdef F_INTERFACE
    LDINT N, 0(N)
    LDINT INCX, 0(INCX)
#endif

    vxor.v res1, res1, res1
    vxor.v res2, res2, res2
    bge $r0, N, .L999
    beq $r0, INCX, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    srai.d I, N, 2
    bne INCX, TEMP, .L20
    bge $r0, I, .L997
    .align 3

.L10:
    vld VX0, X, 0 * SIZE
    vfcvtl.d.s VX1, VX0
    vfcvth.d.s VX2, VX0
    vfmadd.d res1, VX1, VX1, res1
    vfmadd.d res2, VX2, VX2, res2
    vld VX0, X, 4 * SIZE
    vfcvtl.d.s VX3, VX0
    vfcvth.d.s VX4, VX0
    vfmadd.d res1, VX3, VX3, res1
    vfmadd.d res2, VX4, VX4, res2
    addi.d I, I, -1
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L10
    b .L996
    .align 3

.L20:
    bge $r0, I, .L997
    .align 3

.L21:
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    vfcvtl.d.s VX1, VX0
    vfcvth.d.s VX2, VX0
    vfmadd.d res1, VX1, VX1, res1
    vfmadd.d res2, VX2, VX2, res2
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    vfcvtl.d.s VX3, VX0
    vfcvth.d.s VX4, VX0
    vfmadd.d res1, VX3, VX3, res1
    vfmadd.d res2, VX4, VX4, res2
    addi.d I, I, -1
    blt $r0, I, .L21
    b .L996
    .align 3

.L996:
    vfadd.d res1, res1, res2
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
    .align 3

.L997:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3

.L998:
    fld.s a1, X, 0 * SIZE
    fld.s a2, X, 1 * SIZE
    addi.d I, I, -1
    fcvt.d.s a1, a1
    fcvt.d.s a2, a2
    fmadd.d res, a1, a1, res
    fmadd.d res, a2, a2, res
    add.d X, X, INCX
    blt $r0, I, .L998
    .align 3

.L999:
    fsqrt.d res, res
    move $r4, $r17
    fcvt.s.d $f0, $f19
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
@@ -0,0 +1,306 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/

#define ASSEMBLER

#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $xr12
#define VX1 $xr13

    PROLOGUE
    bge $r0, N, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

/* INCX==1 and INCY==1 */
.L11:
    bge $r0, I, .L112
    .align 3

.L111:
    xvld VX0, X, 0
    addi.d I, I, -1
    xvst VX0, Y, 0
#ifdef DOUBLE
    xvld VX0, X, 32
    xvst VX0, Y, 32
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
    .align 3

.L112:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L113:
    LD $f12, X, 0
    addi.d I, I, -1
    addi.d X, X, SIZE
    ST $f12, Y, 0
    addi.d Y, Y, SIZE
    blt $r0, I, .L113
    b .L999
    .align 3

/* INCX==1 and INCY!=1 */
.L12:
    bge $r0, I, .L122
    .align 3

.L121:
#ifdef DOUBLE
    xvld VX0, X, 0
    xvld VX1, X, 32
    xvstelm.d VX0, Y, 0, 0
    add.d Y, Y, INCY
    xvstelm.d VX0, Y, 0, 1
    add.d Y, Y, INCY
    xvstelm.d VX0, Y, 0, 2
    add.d Y, Y, INCY
    xvstelm.d VX0, Y, 0, 3
    add.d Y, Y, INCY
    xvstelm.d VX1, Y, 0, 0
    add.d Y, Y, INCY
    xvstelm.d VX1, Y, 0, 1
    add.d Y, Y, INCY
    xvstelm.d VX1, Y, 0, 2
    add.d Y, Y, INCY
    xvstelm.d VX1, Y, 0, 3
    add.d Y, Y, INCY
#else
    xvld VX0, X, 0
    xvstelm.w VX0, Y, 0, 0
    add.d Y, Y, INCY
    xvstelm.w VX0, Y, 0, 1
    add.d Y, Y, INCY
    xvstelm.w VX0, Y, 0, 2
    add.d Y, Y, INCY
    xvstelm.w VX0, Y, 0, 3
    add.d Y, Y, INCY
    xvstelm.w VX0, Y, 0, 4
    add.d Y, Y, INCY
    xvstelm.w VX0, Y, 0, 5
    add.d Y, Y, INCY
    xvstelm.w VX0, Y, 0, 6
    add.d Y, Y, INCY
    xvstelm.w VX0, Y, 0, 7
    add.d Y, Y, INCY
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L121
    .align 3

.L122:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L123:
    LD $f12, X, 0
    addi.d I, I, -1
    addi.d X, X, SIZE
    ST $f12, Y, 0
    add.d Y, Y, INCY
    blt $r0, I, .L123
    b .L999
    .align 3

/* INCX!=1 and INCY==1 */
.L21:
    bge $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    ld.d t1, X, 0
    add.d X, X, INCX
    ld.d t2, X, 0
    add.d X, X, INCX
    ld.d t3, X, 0
    add.d X, X, INCX
    ld.d t4, X, 0
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    xvst VX0, Y, 0
    ld.d t1, X, 0
    add.d X, X, INCX
    ld.d t2, X, 0
    add.d X, X, INCX
    ld.d t3, X, 0
    add.d X, X, INCX
    ld.d t4, X, 0
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvst VX1, Y, 32
#else
    ld.w t1, X, 0
    add.d X, X, INCX
    ld.w t2, X, 0
    add.d X, X, INCX
    ld.w t3, X, 0
    add.d X, X, INCX
    ld.w t4, X, 0
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0
    add.d X, X, INCX
    ld.w t2, X, 0
    add.d X, X, INCX
    ld.w t3, X, 0
    add.d X, X, INCX
    ld.w t4, X, 0
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    xvst VX0, Y, 0
#endif
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L211
    .align 3

.L212:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L213:
    LD $f12, X, 0
    addi.d I, I, -1
    ST $f12, Y, 0
    add.d X, X, INCX
    addi.d Y, Y, SIZE
    blt $r0, I, .L213
    b .L999
    .align 3

/* INCX!=1 and INCY!=1 */
.L22:
    bge $r0, I, .L223
    .align 3

.L222:
    LD a1, X, 0
    add.d X, X, INCX
    LD a2, X, 0
    add.d X, X, INCX
    LD a3, X, 0
    add.d X, X, INCX
    LD a4, X, 0
    add.d X, X, INCX
    ST a1, Y, 0
    add.d Y, Y, INCY
    ST a2, Y, 0
    add.d Y, Y, INCY
    ST a3, Y, 0
    add.d Y, Y, INCY
    ST a4, Y, 0
    add.d Y, Y, INCY
    LD a1, X, 0
    add.d X, X, INCX
    LD a2, X, 0
    add.d X, X, INCX
    LD a3, X, 0
    add.d X, X, INCX
    LD a4, X, 0
    add.d X, X, INCX
    ST a1, Y, 0
    add.d Y, Y, INCY
    ST a2, Y, 0
    add.d Y, Y, INCY
    ST a3, Y, 0
    add.d Y, Y, INCY
    ST a4, Y, 0
    add.d Y, Y, INCY
    addi.d I, I, -1
    blt $r0, I, .L222
    .align 3

.L223:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L224:
    LD $f12, X, 0
    addi.d I, I, -1
    ST $f12, Y, 0
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
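The copy kernel above (and its LSX twin below) implements a plain strided copy; the element type is float or double depending on whether DOUBLE is defined at build time. A hedged scalar equivalent, with illustrative names and increments assumed non-negative:

/* Hedged sketch, not the shipped implementation. */
typedef float FLOAT;   /* illustrative: float here, double in DOUBLE builds */

static void copy_ref(int n, const FLOAT *x, int incx, FLOAT *y, int incy)
{
    for (int i = 0; i < n; i++)            /* vector loops do 8 per pass */
        y[(long)i * incy] = x[(long)i * incx];
}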
@@ -0,0 +1,316 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/

#define ASSEMBLER

#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define VX0 $vr12
#define VX1 $vr13

    PROLOGUE
    bge $r0, N, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, BASE_SHIFT
    slli.d INCX, INCX, BASE_SHIFT
    slli.d INCY, INCY, BASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

/* INCX==1 and INCY==1 */
.L11:
    bge $r0, I, .L112
    .align 3

.L111:
    vld VX0, X, 0
    vld VX1, X, 16
    addi.d I, I, -1
    vst VX0, Y, 0
    vst VX1, Y, 16
#ifdef DOUBLE
    vld VX0, X, 32
    vld VX1, X, 48
    vst VX0, Y, 32
    vst VX1, Y, 48
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
    .align 3

.L112:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L113:
    LD $f12, X, 0
    addi.d I, I, -1
    addi.d X, X, SIZE
    ST $f12, Y, 0
    addi.d Y, Y, SIZE
    blt $r0, I, .L113
    b .L999
    .align 3

/* INCX==1 and INCY!=1 */
.L12:
    bge $r0, I, .L122
    .align 3

.L121:
#ifdef DOUBLE
    vld VX0, X, 0
    vld VX1, X, 16
    vstelm.d VX0, Y, 0, 0
    add.d Y, Y, INCY
    vstelm.d VX0, Y, 0, 1
    add.d Y, Y, INCY
    vstelm.d VX1, Y, 0, 0
    add.d Y, Y, INCY
    vstelm.d VX1, Y, 0, 1
    add.d Y, Y, INCY
    vld VX0, X, 32
    vld VX1, X, 48
    vstelm.d VX0, Y, 0, 0
    add.d Y, Y, INCY
    vstelm.d VX0, Y, 0, 1
    add.d Y, Y, INCY
    vstelm.d VX1, Y, 0, 0
    add.d Y, Y, INCY
    vstelm.d VX1, Y, 0, 1
    add.d Y, Y, INCY
#else
    vld VX0, X, 0
    vld VX1, X, 16
    vstelm.w VX0, Y, 0, 0
    add.d Y, Y, INCY
    vstelm.w VX0, Y, 0, 1
    add.d Y, Y, INCY
    vstelm.w VX0, Y, 0, 2
    add.d Y, Y, INCY
    vstelm.w VX0, Y, 0, 3
    add.d Y, Y, INCY
    vstelm.w VX1, Y, 0, 0
    add.d Y, Y, INCY
    vstelm.w VX1, Y, 0, 1
    add.d Y, Y, INCY
    vstelm.w VX1, Y, 0, 2
    add.d Y, Y, INCY
    vstelm.w VX1, Y, 0, 3
    add.d Y, Y, INCY
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L121
    .align 3

.L122:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L123:
    LD $f12, X, 0
    addi.d I, I, -1
    addi.d X, X, SIZE
    ST $f12, Y, 0
    add.d Y, Y, INCY
    blt $r0, I, .L123
    b .L999
    .align 3

/* INCX!=1 and INCY==1 */
.L21:
    bge $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    ld.d t1, X, 0
    add.d X, X, INCX
    ld.d t2, X, 0
    add.d X, X, INCX
    ld.d t3, X, 0
    add.d X, X, INCX
    ld.d t4, X, 0
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vst VX0, Y, 0
    vst VX1, Y, 16
    ld.d t1, X, 0
    add.d X, X, INCX
    ld.d t2, X, 0
    add.d X, X, INCX
    ld.d t3, X, 0
    add.d X, X, INCX
    ld.d t4, X, 0
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    vst VX0, Y, 32
    vst VX1, Y, 48
#else
    ld.w t1, X, 0
    add.d X, X, INCX
    ld.w t2, X, 0
    add.d X, X, INCX
    ld.w t3, X, 0
    add.d X, X, INCX
    ld.w t4, X, 0
    add.d X, X, INCX
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    vst VX0, Y, 0
    ld.w t1, X, 0
    add.d X, X, INCX
    ld.w t2, X, 0
    add.d X, X, INCX
    ld.w t3, X, 0
    add.d X, X, INCX
    ld.w t4, X, 0
    add.d X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vst VX1, Y, 16
#endif
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L211
    .align 3

.L212:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L213:
    LD $f12, X, 0
    addi.d I, I, -1
    ST $f12, Y, 0
    add.d X, X, INCX
    addi.d Y, Y, SIZE
    blt $r0, I, .L213
    b .L999
    .align 3

/* INCX!=1 and INCY!=1 */
.L22:
    bge $r0, I, .L223
    .align 3

.L222:
    LD a1, X, 0
    add.d X, X, INCX
    LD a2, X, 0
    add.d X, X, INCX
    LD a3, X, 0
    add.d X, X, INCX
    LD a4, X, 0
    add.d X, X, INCX
    ST a1, Y, 0
    add.d Y, Y, INCY
    ST a2, Y, 0
    add.d Y, Y, INCY
    ST a3, Y, 0
    add.d Y, Y, INCY
    ST a4, Y, 0
    add.d Y, Y, INCY
    LD a1, X, 0
    add.d X, X, INCX
    LD a2, X, 0
    add.d X, X, INCX
    LD a3, X, 0
    add.d X, X, INCX
    LD a4, X, 0
    add.d X, X, INCX
    ST a1, Y, 0
    add.d Y, Y, INCY
    ST a2, Y, 0
    add.d Y, Y, INCY
    ST a3, Y, 0
    add.d Y, Y, INCY
    ST a4, Y, 0
    add.d Y, Y, INCY
    addi.d I, I, -1
    blt $r0, I, .L222
    .align 3

.L223:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L224:
    LD $f12, X, 0
    addi.d I, I, -1
    ST $f12, Y, 0
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
File diff suppressed because it is too large
@@ -0,0 +1,907 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define C $f0
#define S $f1

#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define s1 $f16
#define s2 $f17
#define s3 $f18
#define s4 $f19
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VT0 $vr10
#define VT1 $vr18
#define VXC $vr23
#define VXS $vr9
#define VXZ $vr11
#define x1 $vr12
#define x2 $vr13
#define x3 $vr14
#define x4 $vr15

    PROLOGUE

    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    slli.d INCY, INCY, ZBASE_SHIFT
    MTG t1, C
    MTG t2, S
    MTG t3, a1
#ifdef DOUBLE
    vreplgr2vr.d VXC, t1
    vreplgr2vr.d VXS, t2
    vreplgr2vr.d VXZ, t3
    srai.d I, N, 1      // 2 complex per pass in the DOUBLE vector loops below
#else
    vreplgr2vr.w VXC, t1
    vreplgr2vr.w VXS, t2
    vreplgr2vr.w VXZ, t3
    srai.d I, N, 2
#endif
    beq INCX, $r0, .L996
    beq INCY, $r0, .L996
    bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
    bne INCY, TEMP, .L22

.L11:
    bge $r0, I, .L997
    CMPEQ $fcc0, C, a1
    bcnez $fcc0, .L110
    CMPEQ $fcc0, S, a1
    bcnez $fcc0, .L112 // C!=0 S==0
    b .L111 // C!=0 S!=0
    .align 3

.L110:
    CMPEQ $fcc0, S, a1
    bcnez $fcc0, .L114 // C==0 S==0
    b .L113 // C==0 S!=0
    .align 3

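/* A hedged scalar statement of the rotation the vector bodies below apply
 * (illustrative C, not part of the build; C and S are real scalars, the
 * increments count complex elements and are assumed non-negative here):
 *
 *     for (i = 0; i < n; i++) {
 *         float complex xi = x[i * incx], yi = y[i * incy];
 *         x[i * incx] = C * xi + S * yi;   // VX0 (real parts), VX2 (imag)
 *         y[i * incy] = C * yi - S * xi;   // VX1 (real parts), VX3 (imag)
 *     }
 *
 * The .L11x / .L22x branches specialize this for C or S equal to zero. */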
.L111: // C!=0 S!=0
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vld VX3, Y, 2 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vpickev.d x3, VX3, VX2
    vpickod.d x4, VX3, VX2
    vfmul.d VX0, x1, VXC
    vfmadd.d VX0, x3, VXS, VX0
    vfmul.d VX1, x1, VXS
    vfmsub.d VX1, x3, VXC, VX1
    vfmul.d VX2, x2, VXC
    vfmadd.d VX2, x4, VXS, VX2
    vfmul.d VX3, x2, VXS
    vfmsub.d VX3, x4, VXC, VX3
    vilvl.d x1, VX2, VX0
    vilvh.d x2, VX2, VX0
    vilvl.d x3, VX3, VX1
    vilvh.d x4, VX3, VX1
    vst x1, X, 0 * SIZE
    vst x3, Y, 0 * SIZE
    vst x2, X, 2 * SIZE
    vst x4, Y, 2 * SIZE
    addi.d X, X, 4 * SIZE
    addi.d Y, Y, 4 * SIZE
#else
    vld VX1, X, 4 * SIZE
    vld VX3, Y, 4 * SIZE
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vpickev.w x3, VX3, VX2
    vpickod.w x4, VX3, VX2
    vfmul.s VX0, x1, VXC
    vfmadd.s VX0, x3, VXS, VX0
    vfmul.s VX1, x1, VXS
    vfmsub.s VX1, x3, VXC, VX1
    vfmul.s VX2, x2, VXC
    vfmadd.s VX2, x4, VXS, VX2
    vfmul.s VX3, x2, VXS
    vfmsub.s VX3, x4, VXC, VX3
    vilvl.w x1, VX2, VX0
    vilvh.w x2, VX2, VX0
    vilvl.w x3, VX3, VX1
    vilvh.w x4, VX3, VX1
    vst x1, X, 0 * SIZE
    vst x3, Y, 0 * SIZE
    vst x2, X, 4 * SIZE
    vst x4, Y, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L111
    b .L997
    .align 3

.L112: // C!=0 S==0
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vld VX3, Y, 2 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vpickev.d x3, VX3, VX2
    vpickod.d x4, VX3, VX2
    vfmul.d VX0, x1, VXC
    vfmul.d VX1, x3, VXC
    vfmul.d VX2, x2, VXC
    vfmul.d VX3, x4, VXC
    vilvl.d x1, VX2, VX0
    vilvh.d x2, VX2, VX0
    vilvl.d x3, VX3, VX1
    vilvh.d x4, VX3, VX1
    vst x1, X, 0 * SIZE
    vst x3, Y, 0 * SIZE
    vst x2, X, 2 * SIZE
    vst x4, Y, 2 * SIZE
    addi.d X, X, 4 * SIZE
    addi.d Y, Y, 4 * SIZE
#else
    vld VX1, X, 4 * SIZE
    vld VX3, Y, 4 * SIZE
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vpickev.w x3, VX3, VX2
    vpickod.w x4, VX3, VX2
    vfmul.s VX0, x1, VXC
    vfmul.s VX1, x3, VXC
    vfmul.s VX2, x2, VXC
    vfmul.s VX3, x4, VXC
    vilvl.w x1, VX2, VX0
    vilvh.w x2, VX2, VX0
    vilvl.w x3, VX3, VX1
    vilvh.w x4, VX3, VX1
    vst x1, X, 0 * SIZE
    vst x3, Y, 0 * SIZE
    vst x2, X, 4 * SIZE
    vst x4, Y, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L112
    b .L997
    .align 3

.L113: // C==0 S!=0
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vld VX3, Y, 2 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vpickev.d x3, VX3, VX2
    vpickod.d x4, VX3, VX2
    vfmul.d VX0, x3, VXS
    vfmul.d VX1, x1, VXS
    vfsub.d VX1, VXZ, VX1
    vfmul.d VX2, x4, VXS
    vfmul.d VX3, x2, VXS
    vfsub.d VX3, VXZ, VX3
    vilvl.d x1, VX2, VX0
    vilvh.d x2, VX2, VX0
    vilvl.d x3, VX3, VX1
    vilvh.d x4, VX3, VX1
    vst x1, X, 0 * SIZE
    vst x3, Y, 0 * SIZE
    vst x2, X, 2 * SIZE
    vst x4, Y, 2 * SIZE
    addi.d X, X, 4 * SIZE
    addi.d Y, Y, 4 * SIZE
#else
    vld VX1, X, 4 * SIZE
    vld VX3, Y, 4 * SIZE
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vpickev.w x3, VX3, VX2
    vpickod.w x4, VX3, VX2
    vfmul.s VX0, x3, VXS
    vfmul.s VX1, x1, VXS
    vfsub.s VX1, VXZ, VX1
    vfmul.s VX2, x4, VXS
    vfmul.s VX3, x2, VXS
    vfsub.s VX3, VXZ, VX3
    vilvl.w x1, VX2, VX0
    vilvh.w x2, VX2, VX0
    vilvl.w x3, VX3, VX1
    vilvh.w x4, VX3, VX1
    vst x1, X, 0 * SIZE
    vst x3, Y, 0 * SIZE
    vst x2, X, 4 * SIZE
    vst x4, Y, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L113
    b .L997
    .align 3

.L114: // C==0 S==0
    vst VXZ, X, 0 * SIZE
    vst VXZ, Y, 0 * SIZE
#ifdef DOUBLE
    vst VXZ, X, 2 * SIZE
    vst VXZ, Y, 2 * SIZE
    addi.d X, X, 4 * SIZE
    addi.d Y, Y, 4 * SIZE
#else
    vst VXZ, X, 4 * SIZE
    vst VXZ, Y, 4 * SIZE
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L114
    b .L997
    .align 3

.L22:
#ifdef DOUBLE
    srai.d I, N, 2
#endif
    bge $r0, I, .L997
    move YY, Y
    move XX, X
    CMPEQ $fcc0, C, a1
    bcnez $fcc0, .L220
    CMPEQ $fcc0, S, a1
    bcnez $fcc0, .L222 // C!=0 S==0
    b .L221 // C!=0 S!=0
    .align 3

.L220:
    CMPEQ $fcc0, S, a1
    bcnez $fcc0, .L224 // C==0 S==0
    b .L223 // C==0 S!=0
    .align 3

.L221: // C!=0 S!=0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vfmul.d VX0, x1, VXC
    vfmadd.d VX0, x3, VXS, VX0
    vfmul.d VX1, x1, VXS
    vfmsub.d VX1, x3, VXC, VX1
    vfmul.d VX2, x2, VXC
    vfmadd.d VX2, x4, VXS, VX2
    vfmul.d VX3, x2, VXS
    vfmsub.d VX3, x4, VXC, VX3
    vstelm.d VX0, XX, 0, 0
    vstelm.d VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VX0, XX, 0, 1
    vstelm.d VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.d VX1, YY, 0, 0
    vstelm.d VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
    vstelm.d VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY

    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vfmul.d VX0, x1, VXC
    vfmadd.d VX0, x3, VXS, VX0
    vfmul.d VX1, x1, VXS
    vfmsub.d VX1, x3, VXC, VX1
    vfmul.d VX2, x2, VXC
    vfmadd.d VX2, x4, VXS, VX2
    vfmul.d VX3, x2, VXS
    vfmsub.d VX3, x4, VXC, VX3
    vstelm.d VX0, XX, 0, 0
    vstelm.d VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VX0, XX, 0, 1
    vstelm.d VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.d VX1, YY, 0, 0
    vstelm.d VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
    vstelm.d VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    addi.d I, I, -1
    blt $r0, I, .L221
    b .L995
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w x1, t1, 0
    vinsgr2vr.w x2, t2, 0
    vinsgr2vr.w x1, t3, 1
    vinsgr2vr.w x2, t4, 1
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    vinsgr2vr.w x3, t1, 0
    vinsgr2vr.w x4, t2, 0
    vinsgr2vr.w x3, t3, 1
    vinsgr2vr.w x4, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w x1, t1, 2
    vinsgr2vr.w x2, t2, 2
    vinsgr2vr.w x1, t3, 3
    vinsgr2vr.w x2, t4, 3
    add.d X, X, INCX
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    vinsgr2vr.w x3, t1, 2
    vinsgr2vr.w x4, t2, 2
    vinsgr2vr.w x3, t3, 3
    vinsgr2vr.w x4, t4, 3
    add.d Y, Y, INCY

    vfmul.s VX0, x1, VXC
    vfmadd.s VX0, x3, VXS, VX0
    vfmul.s VX1, x1, VXS
    vfmsub.s VX1, x3, VXC, VX1
    vfmul.s VX2, x2, VXC
    vfmadd.s VX2, x4, VXS, VX2
    vfmul.s VX3, x2, VXS
    vfmsub.s VX3, x4, VXC, VX3
    vstelm.w VX0, XX, 0, 0
    vstelm.w VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 1
    vstelm.w VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 2
    vstelm.w VX2, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 3
    vstelm.w VX2, XX, 1 * SIZE, 3
    add.d XX, XX, INCX
    vstelm.w VX1, YY, 0, 0
    vstelm.w VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 1
    vstelm.w VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 2
    vstelm.w VX3, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 3
    vstelm.w VX3, YY, 1 * SIZE, 3
    add.d YY, YY, INCY
    addi.d I, I, -1
    blt $r0, I, .L221
    b .L997
#endif
    .align 3

.L222: // C!=0 S==0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vfmul.d VX0, x1, VXC
    vfmul.d VX1, x3, VXC
    vfmul.d VX2, x2, VXC
    vfmul.d VX3, x4, VXC
    vstelm.d VX0, XX, 0, 0
    vstelm.d VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VX0, XX, 0, 1
    vstelm.d VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.d VX1, YY, 0, 0
    vstelm.d VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
    vstelm.d VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY

    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vfmul.d VX0, x1, VXC
    vfmul.d VX1, x3, VXC
    vfmul.d VX2, x2, VXC
    vfmul.d VX3, x4, VXC
    vstelm.d VX0, XX, 0, 0
    vstelm.d VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VX0, XX, 0, 1
    vstelm.d VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.d VX1, YY, 0, 0
    vstelm.d VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
    vstelm.d VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    addi.d I, I, -1
    blt $r0, I, .L222
    b .L995
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w x1, t1, 0
    vinsgr2vr.w x2, t2, 0
    vinsgr2vr.w x1, t3, 1
    vinsgr2vr.w x2, t4, 1
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    vinsgr2vr.w x3, t1, 0
    vinsgr2vr.w x4, t2, 0
    vinsgr2vr.w x3, t3, 1
    vinsgr2vr.w x4, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w x1, t1, 2
    vinsgr2vr.w x2, t2, 2
    vinsgr2vr.w x1, t3, 3
    vinsgr2vr.w x2, t4, 3
    add.d X, X, INCX
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    vinsgr2vr.w x3, t1, 2
    vinsgr2vr.w x4, t2, 2
    vinsgr2vr.w x3, t3, 3
    vinsgr2vr.w x4, t4, 3
    add.d Y, Y, INCY
    vfmul.s VX0, x1, VXC
    vfmul.s VX1, x3, VXC
    vfmul.s VX2, x2, VXC
    vfmul.s VX3, x4, VXC
    vstelm.w VX0, XX, 0, 0
    vstelm.w VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 1
    vstelm.w VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 2
    vstelm.w VX2, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 3
    vstelm.w VX2, XX, 1 * SIZE, 3
    add.d XX, XX, INCX
    vstelm.w VX1, YY, 0, 0
    vstelm.w VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 1
    vstelm.w VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 2
    vstelm.w VX3, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 3
    vstelm.w VX3, YY, 1 * SIZE, 3
    add.d YY, YY, INCY
    addi.d I, I, -1
    blt $r0, I, .L222
    b .L997
#endif
    .align 3

.L223: // C==0 S!=0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vfmul.d VX0, x3, VXS
    vfmul.d VX1, x1, VXS
    vfsub.d VX1, VXZ, VX1
    vfmul.d VX2, x4, VXS
    vfmul.d VX3, x2, VXS
    vfsub.d VX3, VXZ, VX3
    vstelm.d VX0, XX, 0, 0
    vstelm.d VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VX0, XX, 0, 1
    vstelm.d VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.d VX1, YY, 0, 0
    vstelm.d VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
    vstelm.d VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY

    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    ld.d t1, Y, 0 * SIZE
    ld.d t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.d t3, Y, 0 * SIZE
    ld.d t4, Y, 1 * SIZE
    vinsgr2vr.d x3, t1, 0
    vinsgr2vr.d x4, t2, 0
    vinsgr2vr.d x3, t3, 1
    vinsgr2vr.d x4, t4, 1
    add.d Y, Y, INCY
    vfmul.d VX0, x3, VXS
    vfmul.d VX1, x1, VXS
    vfsub.d VX1, VXZ, VX1
    vfmul.d VX2, x4, VXS
    vfmul.d VX3, x2, VXS
    vfsub.d VX3, VXZ, VX3
    vstelm.d VX0, XX, 0, 0
    vstelm.d VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VX0, XX, 0, 1
    vstelm.d VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.d VX1, YY, 0, 0
    vstelm.d VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
    vstelm.d VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    addi.d I, I, -1
    blt $r0, I, .L223
    b .L995
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w x1, t1, 0
    vinsgr2vr.w x2, t2, 0
    vinsgr2vr.w x1, t3, 1
    vinsgr2vr.w x2, t4, 1
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    add.d Y, Y, INCY
    vinsgr2vr.w x3, t1, 0
    vinsgr2vr.w x4, t2, 0
    vinsgr2vr.w x3, t3, 1
    vinsgr2vr.w x4, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w x1, t1, 2
    vinsgr2vr.w x2, t2, 2
    vinsgr2vr.w x1, t3, 3
    vinsgr2vr.w x2, t4, 3
    add.d X, X, INCX
    ld.w t1, Y, 0 * SIZE
    ld.w t2, Y, 1 * SIZE
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    ld.w t4, Y, 1 * SIZE
    vinsgr2vr.w x3, t1, 2
    vinsgr2vr.w x4, t2, 2
    vinsgr2vr.w x3, t3, 3
    vinsgr2vr.w x4, t4, 3
    add.d Y, Y, INCY
    vfmul.s VX0, x3, VXS
    vfmul.s VX1, x1, VXS
    vfsub.s VX1, VXZ, VX1
    vfmul.s VX2, x4, VXS
    vfmul.s VX3, x2, VXS
    vfsub.s VX3, VXZ, VX3
    vstelm.w VX0, XX, 0, 0
    vstelm.w VX2, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 1
    vstelm.w VX2, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 2
    vstelm.w VX2, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    vstelm.w VX0, XX, 0, 3
    vstelm.w VX2, XX, 1 * SIZE, 3
    add.d XX, XX, INCX
    vstelm.w VX1, YY, 0, 0
    vstelm.w VX3, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 1
    vstelm.w VX3, YY, 1 * SIZE, 1
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 2
    vstelm.w VX3, YY, 1 * SIZE, 2
    add.d YY, YY, INCY
    vstelm.w VX1, YY, 0, 3
    vstelm.w VX3, YY, 1 * SIZE, 3
    add.d YY, YY, INCY
    addi.d I, I, -1
    blt $r0, I, .L223
    b .L997
#endif
    .align 3

.L224: // C==0 S==0
#ifdef DOUBLE
    vstelm.d VXZ, XX, 0, 0
    vstelm.d VXZ, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VXZ, XX, 0, 0
    vstelm.d VXZ, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VXZ, XX, 0, 0
    vstelm.d VXZ, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VXZ, XX, 0, 0
    vstelm.d VXZ, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d VXZ, YY, 0, 0
    vstelm.d VXZ, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VXZ, YY, 0, 0
    vstelm.d VXZ, YY, 1 * SIZE, 0
    add.d YY, YY, INCY
    vstelm.d VXZ, YY, 0, 0
    vstelm.d VXZ, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.d VXZ, YY, 0, 0
|
||||
vstelm.d VXZ, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L224
|
||||
move X, XX
|
||||
move Y, YY
|
||||
b .L995
|
||||
#else
|
||||
vstelm.w VXZ, XX, 0, 0
|
||||
vstelm.w VXZ, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
vstelm.w VXZ, XX, 0, 0
|
||||
vstelm.w VXZ, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
vstelm.w VXZ, XX, 0, 0
|
||||
vstelm.w VXZ, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
vstelm.w VXZ, XX, 0, 0
|
||||
vstelm.w VXZ, XX, 1 * SIZE, 0
|
||||
add.d XX, XX, INCX
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
vstelm.w VXZ, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
vstelm.w VXZ, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
vstelm.w VXZ, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
vstelm.w VXZ, YY, 0, 0
|
||||
vstelm.w VXZ, YY, 1 * SIZE, 0
|
||||
add.d YY, YY, INCY
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L224
|
||||
move X, XX
|
||||
move Y, YY
|
||||
b .L997
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
#ifdef DOUBLE
|
||||
.L995:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
b .L998
|
||||
.align 3
|
||||
|
||||
#endif
|
||||
.L996:
|
||||
move I, N
|
||||
b .L998
|
||||
.align 3
|
||||
|
||||
.L997:
|
||||
#ifdef DOUBLE
|
||||
andi I, N, 1
|
||||
#else
|
||||
andi I, N, 3
|
||||
#endif
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L998:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, Y, 0 * SIZE
|
||||
LD a4, Y, 1 * SIZE
|
||||
MUL s1, a1, C
|
||||
MADD s1, a3, S, s1
|
||||
MUL s2, a1, S
|
||||
MSUB s2, a3, C, s2
|
||||
MUL s3, a2, C
|
||||
MADD s3, a4, S, s3
|
||||
MUL s4, a2, S
|
||||
MSUB s4, a4, C, s4
|
||||
addi.d I, I, -1
|
||||
ST s1, X, 0 * SIZE
|
||||
ST s2, Y, 0 * SIZE
|
||||
ST s3, X, 1 * SIZE
|
||||
ST s4, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r12
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
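The kernel above applies the BLAS complex plane rotation (csrot/zdrot) to interleaved re/im pairs, with the C==0/S==0 branches specialized. As a minimal plain-C sketch of the same arithmetic, hedged: the function name and the complex-element stride convention are illustrative, not part of the source.

    #include <stddef.h>

    /* Sketch of the rotation the kernel computes:
       x <- c*x + s*y, y <- c*y - s*x, elementwise on re/im pairs. */
    static void ref_crot(size_t n, float *x, long incx,
                         float *y, long incy, float c, float s)
    {
        for (size_t i = 0; i < n; i++) {
            float xr = x[0], xi = x[1];
            float yr = y[0], yi = y[1];
            x[0] = c * xr + s * yr;   /* MUL/MADD into s1, s3 */
            x[1] = c * xi + s * yi;
            y[0] = c * yr - s * xr;   /* MUL/MSUB into s2, s4 */
            y[1] = c * yi - s * xi;
            x += 2 * incx;            /* strides counted in complex elements */
            y += 2 * incy;
        }
    }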
@@ -0,0 +1,645 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define ALPHAR $f0
#define ALPHAI $f1
#define X $r7
#define INCX $r8

#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define s1 $f16
#define s2 $f17
#define s3 $f18
#define s4 $f19
#define VX0 $xr8
#define VX1 $xr20
#define VX2 $xr21
#define VX3 $xr22
#define VXAR $xr23
#define VXAI $xr19
#define VXZ $xr12
#define x1 $xr18
#define x2 $xr17
#define x3 $xr16
#define x4 $xr15

    PROLOGUE

    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    MTG t1, ALPHAR
#ifdef DOUBLE
    xvreplgr2vr.d VXAR, t1
    movfr2gr.d t2, ALPHAI
    xvreplgr2vr.d VXAI, t2
    xvxor.v VXZ, VXZ, VXZ
    srai.d I, N, 2
#else
    xvreplgr2vr.w VXAR, t1
    movfr2gr.s t2, ALPHAI
    xvreplgr2vr.w VXAI, t2
    xvxor.v VXZ, VXZ, VXZ
    srai.d I, N, 3
#endif
    bne INCX, TEMP, .L22

.L11:
    bge $r0, I, .L997
    CMPEQ $fcc0, ALPHAR, a1
    CMPEQ $fcc1, ALPHAI, a1
    bceqz $fcc0, .L13
    b .L14
    .align 3

.L13:
    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
    b .L113 //alpha_r != 0.0 && alpha_i == 0.0

.L14:
    bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0
    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3
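The CMPEQ/bceqz pairs above dispatch on whether alpha_r and alpha_i are exactly zero, so each loop body below can be specialized. A hedged C rendering of that four-way dispatch (the comments mirror the assembly labels):

    /* $fcc0 = (alpha_r == 0), $fcc1 = (alpha_i == 0); bceqz branches
       when the flag is clear, i.e. when the operand is nonzero. */
    if (alpha_r != 0.0) {
        if (alpha_i != 0.0) { /* .L114: full complex multiply   */ }
        else                { /* .L113: scale by real part only */ }
    } else {
        if (alpha_i != 0.0) { /* .L112: purely imaginary alpha  */ }
        else                { /* .L111: store zeros             */ }
    }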

.L111: //alpha_r == 0.0 && alpha_i == 0.0
    xvst VXZ, X, 0 * SIZE
#ifdef DOUBLE
    xvst VXZ, X, 4 * SIZE
    addi.d X, X, 8 * SIZE
#else
    xvst VXZ, X, 8 * SIZE
    addi.d X, X, 16 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L111
    b .L997
    .align 3

.L112: //alpha_r == 0.0 && alpha_i != 0.0
    xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
    xvld VX1, X, 4 * SIZE
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
    xvfmul.d x3, VXAI, x2
    xvfsub.d x3, VXZ, x3
    xvfmul.d x4, VXAI, x1
    xvilvl.d VX2, x4, x3
    xvilvh.d VX3, x4, x3
    xvst VX2, X, 0 * SIZE
    xvst VX3, X, 4 * SIZE
    addi.d X, X, 8 * SIZE
#else
    xvld VX1, X, 8 * SIZE
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
    xvfmul.s x3, VXAI, x2
    xvfsub.s x3, VXZ, x3
    xvfmul.s x4, VXAI, x1
    xvilvl.w VX2, x4, x3
    xvilvh.w VX3, x4, x3
    xvst VX2, X, 0 * SIZE
    xvst VX3, X, 8 * SIZE
    addi.d X, X, 16 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L112
    b .L997
    .align 3

.L113: //alpha_r != 0.0 && alpha_i == 0.0
    xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
    xvld VX1, X, 4 * SIZE
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
    xvfmul.d x3, VXAR, x1
    xvfmul.d x4, VXAR, x2
    xvilvl.d VX2, x4, x3
    xvilvh.d VX3, x4, x3
    xvst VX2, X, 0 * SIZE
    xvst VX3, X, 4 * SIZE
    addi.d X, X, 8 * SIZE
#else
    xvld VX1, X, 8 * SIZE
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
    xvfmul.s x3, VXAR, x1
    xvfmul.s x4, VXAR, x2
    xvilvl.w VX2, x4, x3
    xvilvh.w VX3, x4, x3
    xvst VX2, X, 0 * SIZE
    xvst VX3, X, 8 * SIZE
    addi.d X, X, 16 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L113
    b .L997
    .align 3

.L114: //alpha_r != 0.0 && alpha_i != 0.0
    xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
    xvld VX1, X, 4 * SIZE
    xvpickev.d x1, VX1, VX0
    xvpickod.d x2, VX1, VX0
    xvfmul.d VX0, VXAI, x2
    xvfmsub.d x3, VXAR, x1, VX0
    xvfmul.d VX1, VXAI, x1
    xvfmadd.d x4, VXAR, x2, VX1
    xvilvl.d VX2, x4, x3
    xvilvh.d VX3, x4, x3
    xvst VX2, X, 0 * SIZE
    xvst VX3, X, 4 * SIZE
    addi.d X, X, 8 * SIZE
#else
    xvld VX1, X, 8 * SIZE
    xvpickev.w x1, VX1, VX0
    xvpickod.w x2, VX1, VX0
    xvfmul.s VX0, VXAI, x2
    xvfmsub.s x3, VXAR, x1, VX0
    xvfmul.s VX1, VXAI, x1
    xvfmadd.s x4, VXAR, x2, VX1
    xvilvl.w VX2, x4, x3
    xvilvh.w VX3, x4, x3
    xvst VX2, X, 0 * SIZE
    xvst VX3, X, 8 * SIZE
    addi.d X, X, 16 * SIZE
#endif
    addi.d I, I, -1
    blt $r0, I, .L114
    b .L997
    .align 3

.L22:
    bge $r0, I, .L997
    move XX, X
    CMPEQ $fcc0, ALPHAR, a1
    CMPEQ $fcc1, ALPHAI, a1
    bceqz $fcc0, .L23
    b .L24
    .align 3

.L23:
    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
    b .L223 //alpha_r != 0.0 && alpha_i == 0.0

.L24:
    bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0
    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3

.L221: //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
    xvstelm.d VXZ, X, 0, 0
    xvstelm.d VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.d VXZ, X, 0, 0
    xvstelm.d VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.d VXZ, X, 0, 0
    xvstelm.d VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.d VXZ, X, 0, 0
    xvstelm.d VXZ, X, 1 * SIZE, 0
#else
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    xvstelm.w VXZ, X, 0, 0
    xvstelm.w VXZ, X, 1 * SIZE, 0
#endif
    add.d X, X, INCX
    addi.d I, I, -1
    blt $r0, I, .L221
    b .L997
    .align 3

.L222: //alpha_r == 0.0 && alpha_i != 0.0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 1
    xvinsgr2vr.d x2, t4, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    xvinsgr2vr.d x1, t1, 2
    xvinsgr2vr.d x2, t2, 2
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    add.d X, X, INCX

    xvfmul.d x3, VXAI, x2
    xvfsub.d x3, VXZ, x3
    xvfmul.d x4, VXAI, x1
    addi.d I, I, -1
    xvstelm.d x3, XX, 0 * SIZE, 0
    xvstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 1
    xvstelm.d x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 2
    xvstelm.d x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 3
    xvstelm.d x4, XX, 1 * SIZE, 3
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    add.d X, X, INCX
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    add.d X, X, INCX

    xvfmul.s x3, VXAI, x2
    xvfsub.s x3, VXZ, x3
    xvfmul.s x4, VXAI, x1
    addi.d I, I, -1
    xvstelm.w x3, XX, 0 * SIZE, 0
    xvstelm.w x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 1
    xvstelm.w x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 2
    xvstelm.w x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 3
    xvstelm.w x4, XX, 1 * SIZE, 3
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 4
    xvstelm.w x4, XX, 1 * SIZE, 4
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 5
    xvstelm.w x4, XX, 1 * SIZE, 5
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 6
    xvstelm.w x4, XX, 1 * SIZE, 6
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 7
    xvstelm.w x4, XX, 1 * SIZE, 7
#endif
    add.d XX, XX, INCX
    blt $r0, I, .L222
    b .L997
    .align 3

.L223: //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 1
    xvinsgr2vr.d x2, t4, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    xvinsgr2vr.d x1, t1, 2
    xvinsgr2vr.d x2, t2, 2
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    add.d X, X, INCX

    xvfmul.d x3, VXAR, x1
    xvfmul.d x4, VXAR, x2
    addi.d I, I, -1
    xvstelm.d x3, XX, 0 * SIZE, 0
    xvstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 1
    xvstelm.d x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 2
    xvstelm.d x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 3
    xvstelm.d x4, XX, 1 * SIZE, 3
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    add.d X, X, INCX
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    add.d X, X, INCX

    xvfmul.s x3, VXAR, x1
    xvfmul.s x4, VXAR, x2
    addi.d I, I, -1
    xvstelm.w x3, XX, 0 * SIZE, 0
    xvstelm.w x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 1
    xvstelm.w x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 2
    xvstelm.w x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 3
    xvstelm.w x4, XX, 1 * SIZE, 3
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 4
    xvstelm.w x4, XX, 1 * SIZE, 4
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 5
    xvstelm.w x4, XX, 1 * SIZE, 5
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 6
    xvstelm.w x4, XX, 1 * SIZE, 6
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 7
    xvstelm.w x4, XX, 1 * SIZE, 7
#endif
    add.d XX, XX, INCX
    blt $r0, I, .L223
    b .L997
    .align 3

.L224: //alpha_r != 0.0 && alpha_i != 0.0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d x1, t1, 0
    xvinsgr2vr.d x2, t2, 0
    xvinsgr2vr.d x1, t3, 1
    xvinsgr2vr.d x2, t4, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    xvinsgr2vr.d x1, t1, 2
    xvinsgr2vr.d x2, t2, 2
    xvinsgr2vr.d x1, t3, 3
    xvinsgr2vr.d x2, t4, 3
    add.d X, X, INCX

    xvfmul.d VX0, VXAI, x2
    xvfmsub.d x3, VXAR, x1, VX0
    xvfmul.d VX1, VXAI, x1
    xvfmadd.d x4, VXAR, x2, VX1
    addi.d I, I, -1
    xvstelm.d x3, XX, 0 * SIZE, 0
    xvstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 1
    xvstelm.d x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 2
    xvstelm.d x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    xvstelm.d x3, XX, 0 * SIZE, 3
    xvstelm.d x4, XX, 1 * SIZE, 3
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 0
    xvinsgr2vr.w x2, t2, 0
    xvinsgr2vr.w x1, t3, 1
    xvinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    xvinsgr2vr.w x1, t1, 2
    xvinsgr2vr.w x2, t2, 2
    xvinsgr2vr.w x1, t3, 3
    xvinsgr2vr.w x2, t4, 3
    add.d X, X, INCX
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w x1, t1, 4
    xvinsgr2vr.w x2, t2, 4
    xvinsgr2vr.w x1, t3, 5
    xvinsgr2vr.w x2, t4, 5
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    xvinsgr2vr.w x1, t1, 6
    xvinsgr2vr.w x2, t2, 6
    xvinsgr2vr.w x1, t3, 7
    xvinsgr2vr.w x2, t4, 7
    add.d X, X, INCX

    xvfmul.s VX0, VXAI, x2
    xvfmsub.s x3, VXAR, x1, VX0
    xvfmul.s VX1, VXAI, x1
    xvfmadd.s x4, VXAR, x2, VX1
    addi.d I, I, -1
    xvstelm.w x3, XX, 0 * SIZE, 0
    xvstelm.w x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 1
    xvstelm.w x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 2
    xvstelm.w x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 3
    xvstelm.w x4, XX, 1 * SIZE, 3
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 4
    xvstelm.w x4, XX, 1 * SIZE, 4
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 5
    xvstelm.w x4, XX, 1 * SIZE, 5
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 6
    xvstelm.w x4, XX, 1 * SIZE, 6
    add.d XX, XX, INCX
    xvstelm.w x3, XX, 0 * SIZE, 7
    xvstelm.w x4, XX, 1 * SIZE, 7
#endif
    add.d XX, XX, INCX
    blt $r0, I, .L224
    b .L997
    .align 3

.L997:
#ifdef DOUBLE
    andi I, N, 3
#else
    andi I, N, 7
#endif
    bge $r0, I, .L999
    .align 3

.L998:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    addi.d I, I, -1
    MUL s1, ALPHAI, a2
    MUL s2, ALPHAI, a1
    MSUB s1, ALPHAR, a1, s1
    MADD s2, ALPHAR, a2, s2
    ST s1, X, 0 * SIZE
    ST s2, X, 1 * SIZE
    add.d X, X, INCX
    blt $r0, I, .L998
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
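This file is a complex scal kernel: the scalar tail .L998 and the .L114 vector body both compute x <- alpha * x for interleaved complex data, and the earlier branches are zero-alpha special cases. A hedged plain-C sketch of the general case (the function name is illustrative):

    #include <stddef.h>

    /* Reference semantics of the .L114 path above. */
    static void ref_zscal(size_t n, double ar, double ai, double *x, long incx)
    {
        for (size_t i = 0; i < n; i++) {
            double xr = x[0], xi = x[1];
            x[0] = ar * xr - ai * xi;  /* xvfmsub: VXAR*x1 - VXAI*x2 */
            x[1] = ar * xi + ai * xr;  /* xvfmadd: VXAR*x2 + VXAI*x1 */
            x += 2 * incx;             /* incx counted in complex elements */
        }
    }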
@@ -0,0 +1,571 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define ALPHAR $f0
#define ALPHAI $f1
#define X $r7
#define INCX $r8

#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define s1 $f16
#define s2 $f17
#define s3 $f18
#define s4 $f19
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VXAR $vr23
#define VXAI $vr19
#define VXZ $vr12
#define x1 $vr18
#define x2 $vr17
#define x3 $vr16
#define x4 $vr15

    PROLOGUE

    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
    FFINT a1, a1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    MTG t1, ALPHAR
#ifdef DOUBLE
    vreplgr2vr.d VXAR, t1
    movfr2gr.d t2, ALPHAI
    vreplgr2vr.d VXAI, t2
#else
    vreplgr2vr.w VXAR, t1
    movfr2gr.s t2, ALPHAI
    vreplgr2vr.w VXAI, t2
#endif
    vxor.v VXZ, VXZ, VXZ
    srai.d I, N, 2
    bne INCX, TEMP, .L22

.L11:
    bge $r0, I, .L997
    CMPEQ $fcc0, ALPHAR, a1
    CMPEQ $fcc1, ALPHAI, a1
    bceqz $fcc0, .L13
    b .L14
    .align 3

.L13:
    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
    b .L113 //alpha_r != 0.0 && alpha_i == 0.0

.L14:
    bceqz $fcc1, .L112 //alpha_r == 0.0 && alpha_i != 0.0
    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3

.L111: //alpha_r == 0.0 && alpha_i == 0.0
    vst VXZ, X, 0 * SIZE
#ifdef DOUBLE
    vst VXZ, X, 2 * SIZE
    vst VXZ, X, 4 * SIZE
    vst VXZ, X, 6 * SIZE
#else
    vst VXZ, X, 4 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L111
    b .L997
    .align 3

.L112: //alpha_r == 0.0 && alpha_i != 0.0
    vld VX0, X, 0 * SIZE
#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vfmul.d x3, VXAI, x2
    vfsub.d x3, VXZ, x3
    vfmul.d x4, VXAI, x1
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, X, 0 * SIZE
    vst VX3, X, 2 * SIZE
    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vfmul.d x3, VXAI, x2
    vfsub.d x3, VXZ, x3
    vfmul.d x4, VXAI, x1
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, X, 4 * SIZE
    vst VX3, X, 6 * SIZE
#else
    vld VX1, X, 4 * SIZE
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vfmul.s x3, VXAI, x2
    vfsub.s x3, VXZ, x3
    vfmul.s x4, VXAI, x1
    vilvl.w VX2, x4, x3
    vilvh.w VX3, x4, x3
    vst VX2, X, 0 * SIZE
    vst VX3, X, 4 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L112
    b .L997
    .align 3

.L113: //alpha_r != 0.0 && alpha_i == 0.0
    vld VX0, X, 0 * SIZE
#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vfmul.d x3, VXAR, x1
    vfmul.d x4, VXAR, x2
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, X, 0 * SIZE
    vst VX3, X, 2 * SIZE
    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vfmul.d x3, VXAR, x1
    vfmul.d x4, VXAR, x2
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, X, 4 * SIZE
    vst VX3, X, 6 * SIZE
#else
    vld VX1, X, 4 * SIZE
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vfmul.s x3, VXAR, x1
    vfmul.s x4, VXAR, x2
    vilvl.w VX2, x4, x3
    vilvh.w VX3, x4, x3
    vst VX2, X, 0 * SIZE
    vst VX3, X, 4 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L113
    b .L997
    .align 3

.L114: //alpha_r != 0.0 && alpha_i != 0.0
    vld VX0, X, 0 * SIZE
#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vfmul.d VX0, VXAI, x2
    vfmsub.d x3, VXAR, x1, VX0
    vfmul.d VX1, VXAI, x1
    vfmadd.d x4, VXAR, x2, VX1
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, X, 0 * SIZE
    vst VX3, X, 2 * SIZE
    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vpickev.d x1, VX1, VX0
    vpickod.d x2, VX1, VX0
    vfmul.d VX0, VXAI, x2
    vfmsub.d x3, VXAR, x1, VX0
    vfmul.d VX1, VXAI, x1
    vfmadd.d x4, VXAR, x2, VX1
    vilvl.d VX2, x4, x3
    vilvh.d VX3, x4, x3
    vst VX2, X, 4 * SIZE
    vst VX3, X, 6 * SIZE
#else
    vld VX1, X, 4 * SIZE
    vpickev.w x1, VX1, VX0
    vpickod.w x2, VX1, VX0
    vfmul.s VX0, VXAI, x2
    vfmsub.s x3, VXAR, x1, VX0
    vfmul.s VX1, VXAI, x1
    vfmadd.s x4, VXAR, x2, VX1
    vilvl.w VX2, x4, x3
    vilvh.w VX3, x4, x3
    vst VX2, X, 0 * SIZE
    vst VX3, X, 4 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L114
    b .L997
    .align 3

.L22:
    bge $r0, I, .L997
    move XX, X
    CMPEQ $fcc0, ALPHAR, a1
    CMPEQ $fcc1, ALPHAI, a1
    bceqz $fcc0, .L23
    b .L24
    .align 3

.L23:
    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
    b .L223 //alpha_r != 0.0 && alpha_i == 0.0

.L24:
    bceqz $fcc1, .L222 //alpha_r == 0.0 && alpha_i != 0.0
    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
    .align 3

.L221: //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
    vstelm.d VXZ, X, 0, 0
    vstelm.d VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    vstelm.d VXZ, X, 0, 0
    vstelm.d VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    vstelm.d VXZ, X, 0, 0
    vstelm.d VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    vstelm.d VXZ, X, 0, 0
    vstelm.d VXZ, X, 1 * SIZE, 0
#else
    vstelm.w VXZ, X, 0, 0
    vstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    vstelm.w VXZ, X, 0, 0
    vstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    vstelm.w VXZ, X, 0, 0
    vstelm.w VXZ, X, 1 * SIZE, 0
    add.d X, X, INCX
    vstelm.w VXZ, X, 0, 0
    vstelm.w VXZ, X, 1 * SIZE, 0
#endif
    add.d X, X, INCX
    addi.d I, I, -1
    blt $r0, I, .L221
    b .L997
    .align 3

.L222: //alpha_r == 0.0 && alpha_i != 0.0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    vfmul.d x3, VXAI, x2
    vfsub.d x3, VXZ, x3
    vfmul.d x4, VXAI, x1
    vstelm.d x3, XX, 0 * SIZE, 0
    vstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d x3, XX, 0 * SIZE, 1
    vstelm.d x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX

    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    vfmul.d x3, VXAI, x2
    vfsub.d x3, VXZ, x3
    vfmul.d x4, VXAI, x1
    addi.d I, I, -1
    vstelm.d x3, XX, 0 * SIZE, 0
    vstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d x3, XX, 0 * SIZE, 1
    vstelm.d x4, XX, 1 * SIZE, 1
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w x1, t1, 0
    vinsgr2vr.w x2, t2, 0
    vinsgr2vr.w x1, t3, 1
    vinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w x1, t1, 2
    vinsgr2vr.w x2, t2, 2
    vinsgr2vr.w x1, t3, 3
    vinsgr2vr.w x2, t4, 3
    add.d X, X, INCX

    vfmul.s x3, VXAI, x2
    vfsub.s x3, VXZ, x3
    vfmul.s x4, VXAI, x1
    addi.d I, I, -1
    vstelm.w x3, XX, 0 * SIZE, 0
    vstelm.w x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 1
    vstelm.w x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 2
    vstelm.w x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 3
    vstelm.w x4, XX, 1 * SIZE, 3
#endif
    add.d XX, XX, INCX
    blt $r0, I, .L222
    b .L997
    .align 3

.L223: //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    vfmul.d x3, VXAR, x1
    vfmul.d x4, VXAR, x2
    vstelm.d x3, XX, 0 * SIZE, 0
    vstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d x3, XX, 0 * SIZE, 1
    vstelm.d x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX

    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    vfmul.d x3, VXAR, x1
    vfmul.d x4, VXAR, x2
    addi.d I, I, -1
    vstelm.d x3, XX, 0 * SIZE, 0
    vstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d x3, XX, 0 * SIZE, 1
    vstelm.d x4, XX, 1 * SIZE, 1
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w x1, t1, 0
    vinsgr2vr.w x2, t2, 0
    vinsgr2vr.w x1, t3, 1
    vinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w x1, t1, 2
    vinsgr2vr.w x2, t2, 2
    vinsgr2vr.w x1, t3, 3
    vinsgr2vr.w x2, t4, 3
    add.d X, X, INCX

    vfmul.s x3, VXAR, x1
    vfmul.s x4, VXAR, x2
    addi.d I, I, -1
    vstelm.w x3, XX, 0 * SIZE, 0
    vstelm.w x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 1
    vstelm.w x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 2
    vstelm.w x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 3
    vstelm.w x4, XX, 1 * SIZE, 3
#endif
    add.d XX, XX, INCX
    blt $r0, I, .L223
    b .L997
    .align 3

.L224: //alpha_r != 0.0 && alpha_i != 0.0
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    vfmul.d VX0, VXAI, x2
    vfmsub.d x3, VXAR, x1, VX0
    vfmul.d VX1, VXAI, x1
    vfmadd.d x4, VXAR, x2, VX1
    vstelm.d x3, XX, 0 * SIZE, 0
    vstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d x3, XX, 0 * SIZE, 1
    vstelm.d x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX

    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d x1, t1, 0
    vinsgr2vr.d x2, t2, 0
    vinsgr2vr.d x1, t3, 1
    vinsgr2vr.d x2, t4, 1
    add.d X, X, INCX
    vfmul.d VX0, VXAI, x2
    vfmsub.d x3, VXAR, x1, VX0
    vfmul.d VX1, VXAI, x1
    vfmadd.d x4, VXAR, x2, VX1
    addi.d I, I, -1
    vstelm.d x3, XX, 0 * SIZE, 0
    vstelm.d x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.d x3, XX, 0 * SIZE, 1
    vstelm.d x4, XX, 1 * SIZE, 1
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w x1, t1, 0
    vinsgr2vr.w x2, t2, 0
    vinsgr2vr.w x1, t3, 1
    vinsgr2vr.w x2, t4, 1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    vinsgr2vr.w x1, t1, 2
    vinsgr2vr.w x2, t2, 2
    vinsgr2vr.w x1, t3, 3
    vinsgr2vr.w x2, t4, 3
    add.d X, X, INCX

    vfmul.s VX0, VXAI, x2
    vfmsub.s x3, VXAR, x1, VX0
    vfmul.s VX1, VXAI, x1
    vfmadd.s x4, VXAR, x2, VX1
    addi.d I, I, -1
    vstelm.w x3, XX, 0 * SIZE, 0
    vstelm.w x4, XX, 1 * SIZE, 0
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 1
    vstelm.w x4, XX, 1 * SIZE, 1
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 2
    vstelm.w x4, XX, 1 * SIZE, 2
    add.d XX, XX, INCX
    vstelm.w x3, XX, 0 * SIZE, 3
    vstelm.w x4, XX, 1 * SIZE, 3
#endif
    add.d XX, XX, INCX
    blt $r0, I, .L224
    b .L997
    .align 3

.L997:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3

.L998:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    addi.d I, I, -1
    MUL s1, ALPHAI, a2
    MUL s2, ALPHAI, a1
    MSUB s1, ALPHAR, a1, s1
    MADD s2, ALPHAR, a2, s2
    ST s1, X, 0 * SIZE
    ST s2, X, 1 * SIZE
    add.d X, X, INCX
    blt $r0, I, .L998
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
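This file is the 128-bit LSX counterpart of the LASX kernel above: $vr registers instead of $xr, half the lanes, so the unroll factors and tail masks shrink accordingly. An illustrative (not from the source) summary of the single-precision trip counts:

    /* 2 floats per complex element; 4 floats per $vr, 8 per $xr. */
    static long lsx_tail(long n)  { return n & 3; }  /* 4 complex per pass  */
    static long lasx_tail(long n) { return n & 7; }  /* 8 complex per pass  */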
@@ -0,0 +1,274 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define s1 $f16
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15
#define res1 $xr16
#define res2 $xr17
    PROLOGUE
    xvxor.v res1, res1, res1
    xvxor.v res2, res2, res2
    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L13
    .align 3

.L11:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
    xvld VX2, X, 8 * SIZE
    xvld VX3, X, 12 * SIZE
    xvfadd.d res2, VX2, VX3
    xvfadd.d res1, res1, res2
#else
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 8 * SIZE
    xvfadd.s res2, VX0, VX1
    xvfadd.s res1, res2, res1
#endif
    addi.d X, X, 16 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L11
    .align 3

.L12:
#ifdef DOUBLE
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
#else
    xvfadd.s res2, res1, res2
    xvpickve.w VX1, res1, 1
    xvpickve.w VX2, res1, 2
    xvpickve.w VX3, res1, 3
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
    xvpickve.w VX0, res2, 4
    xvpickve.w VX1, res2, 5
    xvpickve.w VX2, res2, 6
    xvpickve.w VX3, res2, 7
    xvfadd.s res1, VX0, res1
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1  // was "xvfadd.s res1, VX2, res1" twice; VX3 (lane 7) was never added
#endif
    .align 3
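.L12 above folds the 256-bit accumulator down to a scalar by extracting lanes with xvpickve and summing them into lane 0. A plain-C model of that horizontal reduction (the array-as-vector view is illustrative):

    /* Horizontal sum of an 8-lane accumulator, mirroring the
       xvpickve/xvfadd chain; lane 0 ends up holding the total. */
    static float reduce8(const float v[8])
    {
        float s = v[0];
        for (int lane = 1; lane < 8; lane++)
            s += v[lane];
        return s;
    }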
.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    ADD a1, a1, a2
    ADD s1, a1, s1

    addi.d I, I, -1
    addi.d X, X, 2 * SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

.L20:
    bge $r0, I, .L23
    .align 3

.L21:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX0, t1, 0
    xvinsgr2vr.d VX0, t2, 1
    xvinsgr2vr.d VX0, t3, 2
    xvinsgr2vr.d VX0, t4, 3
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.d VX1, t1, 0
    xvinsgr2vr.d VX1, t2, 1
    xvinsgr2vr.d VX1, t3, 2
    xvinsgr2vr.d VX1, t4, 3
    xvfadd.d res2, VX0, VX1
    xvfadd.d res1, res1, res2
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 0
    xvinsgr2vr.w VX0, t2, 1
    xvinsgr2vr.w VX0, t3, 2
    xvinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX0, t1, 4
    xvinsgr2vr.w VX0, t2, 5
    xvinsgr2vr.w VX0, t3, 6
    xvinsgr2vr.w VX0, t4, 7
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX1, t1, 0
    xvinsgr2vr.w VX1, t2, 1
    xvinsgr2vr.w VX1, t3, 2
    xvinsgr2vr.w VX1, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    xvinsgr2vr.w VX1, t1, 4
    xvinsgr2vr.w VX1, t2, 5
    xvinsgr2vr.w VX1, t3, 6
    xvinsgr2vr.w VX1, t4, 7
    xvfadd.s res2, VX0, VX1
    xvfadd.s res1, res2, res1
#endif
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

.L22:
#ifdef DOUBLE
    xvpickve.d VX1, res1, 1
    xvpickve.d VX2, res1, 2
    xvpickve.d VX3, res1, 3
    xvfadd.d res1, VX1, res1
    xvfadd.d res1, VX2, res1
    xvfadd.d res1, VX3, res1
#else
    xvfadd.s res2, res1, res2
    xvpickve.w VX1, res1, 1
    xvpickve.w VX2, res1, 2
    xvpickve.w VX3, res1, 3
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1
    xvpickve.w VX0, res2, 4
    xvpickve.w VX1, res2, 5
    xvpickve.w VX2, res2, 6
    xvpickve.w VX3, res2, 7
    xvfadd.s res1, VX0, res1
    xvfadd.s res1, VX1, res1
    xvfadd.s res1, VX2, res1
    xvfadd.s res1, VX3, res1  // was "xvfadd.s res1, VX2, res1" twice; VX3 (lane 7) was never added
#endif
    .align 3

.L23:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    ADD a1, a1, a2
    ADD s1, a1, s1
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.s $f0, $f16
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
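This kernel implements csum, an OpenBLAS extension that accumulates the real and imaginary parts of every element without taking absolute values (unlike asum), which is why the loops need only adds. A hedged plain-C sketch of the scalar semantics matched by the .L14/.L24 tails:

    #include <stddef.h>

    /* Reference csum: sum of re+im over all elements, no |.| taken. */
    static float ref_csum(size_t n, const float *x, long incx)
    {
        float s = 0.0f;
        for (size_t i = 0; i < n; i++) {
            s += x[0] + x[1];      /* ADD a1, a1, a2; ADD s1, a1, s1 */
            x += 2 * incx;         /* incx counted in complex elements */
        }
        return s;
    }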
@@ -0,0 +1,266 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define t1 $r15
#define t2 $r12
#define t3 $r13
#define t4 $r14
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define s1 $f16
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15
#define res1 $vr16
#define res2 $vr17
#define res3 $vr18
    PROLOGUE
    vxor.v res1, res1, res1
    vxor.v res2, res2, res2
    bge $r0, N, .L999
    bge $r0, INCX, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L13
    .align 3

.L11:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    vld VX2, X, 4 * SIZE
    vld VX3, X, 6 * SIZE
    vfadd.d res2, VX2, VX3
    vfadd.d res1, res1, res2
    vld VX0, X, 8 * SIZE
    vld VX1, X, 10 * SIZE
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    vld VX2, X, 12 * SIZE
    vld VX3, X, 14 * SIZE
    vfadd.d res2, VX2, VX3
    vfadd.d res1, res1, res2
#else
    vld VX0, X, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vfadd.s res2, VX0, VX1
    vld VX2, X, 8 * SIZE
    vld VX3, X, 12 * SIZE
    vfadd.s res3, VX2, VX3
    vfadd.s res2, res3, res2
    vfadd.s res1, res1, res2
#endif

    addi.d I, I, -1
    addi.d X, X, 16 * SIZE
    blt $r0, I, .L11
    .align 3

.L12:
#ifdef DOUBLE
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
#else
    vreplvei.w VX1, res1, 1
    vreplvei.w VX2, res1, 2
    vreplvei.w VX3, res1, 3
    vfadd.s res1, VX1, res1
    vfadd.s res1, VX2, res1
    vfadd.s res1, VX3, res1
#endif
    .align 3

.L13:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L14:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    ADD a1, a1, a2
    ADD s1, a1, s1
    addi.d I, I, -1
    addi.d X, X, 2 * SIZE
    blt $r0, I, .L14
    b .L999
    .align 3

.L20:
    bge $r0, I, .L23
    .align 3

.L21:
#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    vinsgr2vr.d VX1, t1, 0
    vinsgr2vr.d VX1, t2, 1
    add.d X, X, INCX
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t3, 0
    vinsgr2vr.d VX0, t4, 1
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    ld.d t1, X, 0 * SIZE
    ld.d t2, X, 1 * SIZE
    vinsgr2vr.d VX1, t1, 0
    vinsgr2vr.d VX1, t2, 1
    add.d X, X, INCX
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.d VX0, t3, 0
    vinsgr2vr.d VX0, t4, 1
    ld.d t3, X, 0 * SIZE
    ld.d t4, X, 1 * SIZE
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vfadd.d res2, VX0, VX1
    vfadd.d res1, res1, res2
#else
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    vfadd.s res2, VX0, VX1
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX2, t1, 0
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    ld.w t1, X, 0 * SIZE
    ld.w t2, X, 1 * SIZE
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    ld.w t4, X, 1 * SIZE
    add.d X, X, INCX
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    vfadd.s res3, VX2, VX3
    vfadd.s res2, res3, res2
    vfadd.s res1, res1, res2
#endif
    addi.d I, I, -1
    blt $r0, I, .L21
    .align 3

.L22:
#ifdef DOUBLE
    vreplvei.d VX1, res1, 1
    vfadd.d res1, VX1, res1
#else
    vreplvei.w VX1, res1, 1
    vreplvei.w VX2, res1, 2
    vreplvei.w VX3, res1, 3
    vfadd.s res1, VX1, res1
    vfadd.s res1, VX2, res1
    vfadd.s res1, VX3, res1
#endif
    .align 3

.L23:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3

.L24:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    ADD a1, a1, a2
    ADD s1, a1, s1
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L24
    .align 3

.L999:
    fmov.s $f0, $f16
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
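For non-unit INCX, both csum variants assemble vectors element by element with ld.d/vinsgr2vr before a single vector add. In C terms the pattern is a strided gather; a hedged model (the array-as-vector view and the element-stride convention are illustrative):

    /* Gather two strided complex doubles into a 4-lane "vector", as the
       vinsgr2vr.d sequences do before one vfadd.d. */
    static void gather2z(double lane[4], const double *x, long incx_elems)
    {
        lane[0] = x[0]; lane[1] = x[1];   /* ld.d t1/t2; lanes 0,1 */
        x += 2 * incx_elems;              /* add.d X, X, INCX      */
        lane[2] = x[0]; lane[3] = x[1];   /* ld.d t3/t4; lanes 2,3 */
    }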
@@ -0,0 +1,394 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10

#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $xr12
#define VX1 $xr13
#define VX2 $xr14
#define VX3 $xr15

    PROLOGUE
    bge $r0, N, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    slli.d INCY, INCY, ZBASE_SHIFT
    srai.d I, N, 2
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

.L11:
    bge $r0, I, .L112
    .align 3

.L111:
#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX3, Y, 4 * SIZE
    xvst VX2, X, 0 * SIZE
    xvst VX3, X, 4 * SIZE
    xvst VX0, Y, 0 * SIZE
    xvst VX1, Y, 4 * SIZE
#else
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvst VX2, X, 0 * SIZE
    xvst VX0, Y, 0 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L111
    .align 3

.L112:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3
.L113:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    ST a1, Y, 0 * SIZE
    ST a2, Y, 1 * SIZE
    ST a3, X, 0 * SIZE
    ST a4, X, 1 * SIZE
    addi.d I, I, -1
    addi.d X, X, 2 * SIZE
    addi.d Y, Y, 2 * SIZE
    blt $r0, I, .L113
    b .L999
    .align 3
|
||||
.L12: // INCX==1 and INCY!=1
|
||||
bge $r0, I, .L122
|
||||
.align 3
|
||||
|
||||
.L121:
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
xvstelm.d VX0, Y, 0 * SIZE, 0
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
xvstelm.d VX0, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
xvstelm.d VX0, Y, 0 * SIZE, 2
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
xvstelm.d VX0, Y, 1 * SIZE, 3
|
||||
xvinsgr2vr.d VX2, t1, 0
|
||||
xvinsgr2vr.d VX2, t2, 1
|
||||
xvinsgr2vr.d VX2, t3, 2
|
||||
xvinsgr2vr.d VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
xvst VX2, X, 0 * SIZE
|
||||
xvld VX1, X, 4 * SIZE
|
||||
ld.d t1, Y, 0 * SIZE
|
||||
xvstelm.d VX1, Y, 0 * SIZE, 0
|
||||
ld.d t2, Y, 1 * SIZE
|
||||
xvstelm.d VX1, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
ld.d t3, Y, 0 * SIZE
|
||||
xvstelm.d VX1, Y, 0 * SIZE, 2
|
||||
ld.d t4, Y, 1 * SIZE
|
||||
xvstelm.d VX1, Y, 1 * SIZE, 3
|
||||
xvinsgr2vr.d VX3, t1, 0
|
||||
xvinsgr2vr.d VX3, t2, 1
|
||||
xvinsgr2vr.d VX3, t3, 2
|
||||
xvinsgr2vr.d VX3, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
xvst VX3, X, 4 * SIZE
|
||||
#else
|
||||
xvld VX0, X, 0 * SIZE
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 0
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 1
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 2
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 3
|
||||
xvinsgr2vr.w VX2, t1, 0
|
||||
xvinsgr2vr.w VX2, t2, 1
|
||||
xvinsgr2vr.w VX2, t3, 2
|
||||
xvinsgr2vr.w VX2, t4, 3
|
||||
add.d Y, Y, INCY
|
||||
ld.w t1, Y, 0 * SIZE
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 4
|
||||
ld.w t2, Y, 1 * SIZE
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 5
|
||||
add.d Y, Y, INCY
|
||||
ld.w t3, Y, 0 * SIZE
|
||||
xvstelm.w VX0, Y, 0 * SIZE, 6
|
||||
ld.w t4, Y, 1 * SIZE
|
||||
xvstelm.w VX0, Y, 1 * SIZE, 7
|
||||
xvinsgr2vr.w VX2, t1, 4
|
||||
xvinsgr2vr.w VX2, t2, 5
|
||||
xvinsgr2vr.w VX2, t3, 6
|
||||
xvinsgr2vr.w VX2, t4, 7
|
||||
add.d Y, Y, INCY
|
||||
xvst VX2, X, 0 * SIZE
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L121
|
||||
.align 3
|
||||
|
||||
.L122:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L123:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, Y, 0 * SIZE
|
||||
LD a4, Y, 1 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
ST a3, X, 0 * SIZE
|
||||
ST a4, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 2 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L123
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
.L21:
|
||||
bge $r0, I, .L212
|
||||
.align 3
|
||||
|
||||
.L211:
|
||||
#ifdef DOUBLE
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
xvstelm.d VX2, X, 0 * SIZE, 0
|
||||
ld.d t2, X, 1 * SIZE
|
||||
xvstelm.d VX2, X, 1 * SIZE, 1
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
xvstelm.d VX2, X, 0 * SIZE, 2
|
||||
ld.d t4, X, 1 * SIZE
|
||||
xvstelm.d VX2, X, 1 * SIZE, 3
|
||||
xvinsgr2vr.d VX0, t1, 0
|
||||
xvinsgr2vr.d VX0, t2, 1
|
||||
xvinsgr2vr.d VX0, t3, 2
|
||||
xvinsgr2vr.d VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
xvst VX0, Y, 0 * SIZE
|
||||
xvld VX3, Y, 4 * SIZE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
xvstelm.d VX3, X, 0 * SIZE, 0
|
||||
ld.d t2, X, 1 * SIZE
|
||||
xvstelm.d VX3, X, 1 * SIZE, 1
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
xvstelm.d VX3, X, 0 * SIZE, 2
|
||||
ld.d t4, X, 1 * SIZE
|
||||
xvstelm.d VX3, X, 1 * SIZE, 3
|
||||
xvinsgr2vr.d VX1, t1, 0
|
||||
xvinsgr2vr.d VX1, t2, 1
|
||||
xvinsgr2vr.d VX1, t3, 2
|
||||
xvinsgr2vr.d VX1, t4, 3
|
||||
add.d X, X, INCX
|
||||
xvst VX1, Y, 4 * SIZE
|
||||
#else
|
||||
xvld VX2, Y, 0 * SIZE
|
||||
ld.w t1, X, 0 * SIZE
|
||||
xvstelm.w VX2, X, 0 * SIZE, 0
|
||||
ld.w t2, X, 1 * SIZE
|
||||
xvstelm.w VX2, X, 1 * SIZE, 1
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
xvstelm.w VX2, X, 0 * SIZE, 2
|
||||
ld.w t4, X, 1 * SIZE
|
||||
xvstelm.w VX2, X, 1 * SIZE, 3
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
add.d X, X, INCX
|
||||
ld.w t1, X, 0 * SIZE
|
||||
xvstelm.w VX2, X, 0 * SIZE, 4
|
||||
ld.w t2, X, 1 * SIZE
|
||||
xvstelm.w VX2, X, 1 * SIZE, 5
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
xvstelm.w VX2, X, 0 * SIZE, 6
|
||||
ld.w t4, X, 1 * SIZE
|
||||
xvstelm.w VX2, X, 1 * SIZE, 7
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
add.d X, X, INCX
|
||||
xvst VX0, Y, 0 * SIZE
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L211
|
||||
.align 3
|
||||
|
||||
.L212:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L213:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, Y, 0 * SIZE
|
||||
LD a4, Y, 1 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
ST a3, X, 0 * SIZE
|
||||
ST a4, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
addi.d Y, Y, 2 * SIZE
|
||||
blt $r0, I, .L213
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
bge $r0, I, .L223
|
||||
.align 3
|
||||
move XX, X
|
||||
|
||||
.L222:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
LD b2, Y, 1 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD b3, Y, 0 * SIZE
|
||||
ST a3, Y, 0 * SIZE
|
||||
LD b4, Y, 1 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
LD a1, X, 0 * SIZE
|
||||
ST b1, XX, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b2, XX, 1 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
ST b3, XX, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b4, XX, 1 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
|
||||
LD b1, Y, 0 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
LD b2, Y, 1 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD b3, Y, 0 * SIZE
|
||||
ST a3, Y, 0 * SIZE
|
||||
LD b4, Y, 1 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
|
||||
ST b1, XX, 0 * SIZE
|
||||
ST b2, XX, 1 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST b3, XX, 0 * SIZE
|
||||
ST b4, XX, 1 * SIZE
|
||||
|
||||
add.d XX, XX, INCX
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L222
|
||||
.align 3
|
||||
|
||||
.L223:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L224:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, Y, 0 * SIZE
|
||||
LD a4, Y, 1 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
ST a3, X, 0 * SIZE
|
||||
ST a4, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L224
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r12
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
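
The kernel above is a complex-vector swap with four specialized stride paths. A minimal C sketch of its reference semantics follows; the name zswap_ref and its signature are illustrative (strides are in complex elements, as in the BLAS convention, before the ZBASE_SHIFT scaling done in the PROLOGUE).

#include <stddef.h>

/* Hypothetical reference for the swap above: exchange n double-complex
 * elements of x and y, each stored as (re, im) pairs. */
void zswap_ref(size_t n, double *x, size_t inc_x, double *y, size_t inc_y)
{
    for (size_t i = 0; i < n; i++) {
        double re = x[0], im = x[1];   /* LD a1 / LD a2 */
        x[0] = y[0]; x[1] = y[1];      /* LD a3,a4 ; ST into X */
        y[0] = re;   y[1] = im;        /* ST into Y */
        x += 2 * inc_x;
        y += 2 * inc_y;
    }
}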
@@ -0,0 +1,421 @@

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER

#include "common.h"
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10

#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define b1 $f16
#define b2 $f17
#define b3 $f18
#define b4 $f19
#define VX0 $vr12
#define VX1 $vr13
#define VX2 $vr14
#define VX3 $vr15

    PROLOGUE
    bge $r0, N, .L999
    li.d TEMP, 1
    slli.d TEMP, TEMP, ZBASE_SHIFT
    slli.d INCX, INCX, ZBASE_SHIFT
    slli.d INCY, INCY, ZBASE_SHIFT
    srai.d I, N, 2
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
    b .L11 // INCX==1 and INCY==1
.L20:
    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
    b .L21 // INCX!=1 and INCY==1

.L11:
    bge $r0, I, .L112
    .align 3

.L111:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX3, Y, 2 * SIZE
    vst VX2, X, 0 * SIZE
    vst VX3, X, 2 * SIZE
    vst VX0, Y, 0 * SIZE
    vst VX1, Y, 2 * SIZE
    vld VX0, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    vld VX2, Y, 4 * SIZE
    vld VX3, Y, 6 * SIZE
    vst VX2, X, 4 * SIZE
    vst VX3, X, 6 * SIZE
    vst VX0, Y, 4 * SIZE
    vst VX1, Y, 6 * SIZE
#else
    vld VX0, X, 0 * SIZE
    vld VX1, X, 4 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX3, Y, 4 * SIZE
    vst VX2, X, 0 * SIZE
    vst VX3, X, 4 * SIZE
    vst VX0, Y, 0 * SIZE
    vst VX1, Y, 4 * SIZE
#endif
    addi.d I, I, -1
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
    .align 3

.L112:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3

.L113:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    ST a1, Y, 0 * SIZE
    ST a2, Y, 1 * SIZE
    ST a3, X, 0 * SIZE
    ST a4, X, 1 * SIZE
    addi.d I, I, -1
    addi.d X, X, 2 * SIZE
    addi.d Y, Y, 2 * SIZE
    blt $r0, I, .L113
    b .L999
    .align 3

.L12: // INCX==1 and INCY!=1
    bge $r0, I, .L122
    .align 3

.L121:
#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    ld.d t1, Y, 0 * SIZE
    vstelm.d VX0, Y, 0 * SIZE, 0
    ld.d t2, Y, 1 * SIZE
    vstelm.d VX0, Y, 1 * SIZE, 1
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d Y, Y, INCY
    vst VX2, X, 0 * SIZE
    vld VX1, X, 2 * SIZE
    ld.d t3, Y, 0 * SIZE
    vstelm.d VX1, Y, 0 * SIZE, 0
    ld.d t4, Y, 1 * SIZE
    vstelm.d VX1, Y, 1 * SIZE, 1
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d Y, Y, INCY
    vst VX3, X, 2 * SIZE
    vld VX0, X, 4 * SIZE
    ld.d t1, Y, 0 * SIZE
    vstelm.d VX0, Y, 0 * SIZE, 0
    ld.d t2, Y, 1 * SIZE
    vstelm.d VX0, Y, 1 * SIZE, 1
    vinsgr2vr.d VX2, t1, 0
    vinsgr2vr.d VX2, t2, 1
    add.d Y, Y, INCY
    vst VX2, X, 4 * SIZE
    vld VX1, X, 6 * SIZE
    ld.d t3, Y, 0 * SIZE
    vstelm.d VX1, Y, 0 * SIZE, 0
    ld.d t4, Y, 1 * SIZE
    vstelm.d VX1, Y, 1 * SIZE, 1
    vinsgr2vr.d VX3, t3, 0
    vinsgr2vr.d VX3, t4, 1
    add.d Y, Y, INCY
    vst VX3, X, 6 * SIZE
#else
    vld VX0, X, 0 * SIZE
    ld.w t1, Y, 0 * SIZE
    vstelm.w VX0, Y, 0 * SIZE, 0
    ld.w t2, Y, 1 * SIZE
    vstelm.w VX0, Y, 1 * SIZE, 1
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    vstelm.w VX0, Y, 0 * SIZE, 2
    ld.w t4, Y, 1 * SIZE
    vstelm.w VX0, Y, 1 * SIZE, 3
    vinsgr2vr.w VX2, t1, 0
    vinsgr2vr.w VX2, t2, 1
    vinsgr2vr.w VX2, t3, 2
    vinsgr2vr.w VX2, t4, 3
    add.d Y, Y, INCY
    vst VX2, X, 0 * SIZE

    vld VX1, X, 4 * SIZE
    ld.w t1, Y, 0 * SIZE
    vstelm.w VX1, Y, 0 * SIZE, 0
    ld.w t2, Y, 1 * SIZE
    vstelm.w VX1, Y, 1 * SIZE, 1
    add.d Y, Y, INCY
    ld.w t3, Y, 0 * SIZE
    vstelm.w VX1, Y, 0 * SIZE, 2
    ld.w t4, Y, 1 * SIZE
    vstelm.w VX1, Y, 1 * SIZE, 3
    vinsgr2vr.w VX3, t1, 0
    vinsgr2vr.w VX3, t2, 1
    vinsgr2vr.w VX3, t3, 2
    vinsgr2vr.w VX3, t4, 3
    add.d Y, Y, INCY
    vst VX3, X, 4 * SIZE
#endif
    addi.d X, X, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L121
    .align 3

.L122:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3

.L123:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    ST a1, Y, 0 * SIZE
    ST a2, Y, 1 * SIZE
    ST a3, X, 0 * SIZE
    ST a4, X, 1 * SIZE
    addi.d I, I, -1
    addi.d X, X, 2 * SIZE
    add.d Y, Y, INCY
    blt $r0, I, .L123
    b .L999
    .align 3

.L21: // INCX!=1 and INCY==1
    bge $r0, I, .L212
    .align 3

.L211:
#ifdef DOUBLE
    vld VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    vstelm.d VX2, X, 0 * SIZE, 0
    ld.d t2, X, 1 * SIZE
    vstelm.d VX2, X, 1 * SIZE, 1
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    add.d X, X, INCX
    vst VX0, Y, 0 * SIZE
    vld VX3, Y, 2 * SIZE
    ld.d t3, X, 0 * SIZE
    vstelm.d VX3, X, 0 * SIZE, 0
    ld.d t4, X, 1 * SIZE
    vstelm.d VX3, X, 1 * SIZE, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vst VX1, Y, 2 * SIZE
    vld VX2, Y, 4 * SIZE
    ld.d t1, X, 0 * SIZE
    vstelm.d VX2, X, 0 * SIZE, 0
    ld.d t2, X, 1 * SIZE
    vstelm.d VX2, X, 1 * SIZE, 1
    vinsgr2vr.d VX0, t1, 0
    vinsgr2vr.d VX0, t2, 1
    add.d X, X, INCX
    vst VX0, Y, 4 * SIZE
    vld VX3, Y, 6 * SIZE
    ld.d t3, X, 0 * SIZE
    vstelm.d VX3, X, 0 * SIZE, 0
    ld.d t4, X, 1 * SIZE
    vstelm.d VX3, X, 1 * SIZE, 1
    vinsgr2vr.d VX1, t3, 0
    vinsgr2vr.d VX1, t4, 1
    add.d X, X, INCX
    vst VX1, Y, 6 * SIZE
#else
    vld VX2, Y, 0 * SIZE
    ld.w t1, X, 0 * SIZE
    vstelm.w VX2, X, 0 * SIZE, 0
    ld.w t2, X, 1 * SIZE
    vstelm.w VX2, X, 1 * SIZE, 1
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    vstelm.w VX2, X, 0 * SIZE, 2
    ld.w t4, X, 1 * SIZE
    vstelm.w VX2, X, 1 * SIZE, 3
    vinsgr2vr.w VX0, t1, 0
    vinsgr2vr.w VX0, t2, 1
    vinsgr2vr.w VX0, t3, 2
    vinsgr2vr.w VX0, t4, 3
    add.d X, X, INCX
    vst VX0, Y, 0 * SIZE

    vld VX3, Y, 4 * SIZE
    ld.w t1, X, 0 * SIZE
    vstelm.w VX3, X, 0 * SIZE, 0
    ld.w t2, X, 1 * SIZE
    vstelm.w VX3, X, 1 * SIZE, 1
    add.d X, X, INCX
    ld.w t3, X, 0 * SIZE
    vstelm.w VX3, X, 0 * SIZE, 2
    ld.w t4, X, 1 * SIZE
    vstelm.w VX3, X, 1 * SIZE, 3
    vinsgr2vr.w VX1, t1, 0
    vinsgr2vr.w VX1, t2, 1
    vinsgr2vr.w VX1, t3, 2
    vinsgr2vr.w VX1, t4, 3
    add.d X, X, INCX
    vst VX1, Y, 4 * SIZE
#endif
    addi.d Y, Y, 8 * SIZE
    addi.d I, I, -1
    blt $r0, I, .L211
    .align 3

.L212:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3

.L213:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    ST a1, Y, 0 * SIZE
    ST a2, Y, 1 * SIZE
    ST a3, X, 0 * SIZE
    ST a4, X, 1 * SIZE
    addi.d I, I, -1
    add.d X, X, INCX
    addi.d Y, Y, 2 * SIZE
    blt $r0, I, .L213
    b .L999
    .align 3

.L22:
    bge $r0, I, .L223
    .align 3
    move XX, X

.L222:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    add.d X, X, INCX
    LD a3, X, 0 * SIZE
    LD a4, X, 1 * SIZE
    add.d X, X, INCX
    LD b1, Y, 0 * SIZE
    ST a1, Y, 0 * SIZE
    LD b2, Y, 1 * SIZE
    ST a2, Y, 1 * SIZE
    add.d Y, Y, INCY
    LD b3, Y, 0 * SIZE
    ST a3, Y, 0 * SIZE
    LD b4, Y, 1 * SIZE
    ST a4, Y, 1 * SIZE
    add.d Y, Y, INCY

    LD a1, X, 0 * SIZE
    ST b1, XX, 0 * SIZE
    LD a2, X, 1 * SIZE
    add.d X, X, INCX
    ST b2, XX, 1 * SIZE
    add.d XX, XX, INCX
    LD a3, X, 0 * SIZE
    ST b3, XX, 0 * SIZE
    LD a4, X, 1 * SIZE
    add.d X, X, INCX
    ST b4, XX, 1 * SIZE
    add.d XX, XX, INCX

    LD b1, Y, 0 * SIZE
    ST a1, Y, 0 * SIZE
    LD b2, Y, 1 * SIZE
    ST a2, Y, 1 * SIZE
    add.d Y, Y, INCY
    LD b3, Y, 0 * SIZE
    ST a3, Y, 0 * SIZE
    LD b4, Y, 1 * SIZE
    ST a4, Y, 1 * SIZE
    add.d Y, Y, INCY

    ST b1, XX, 0 * SIZE
    ST b2, XX, 1 * SIZE
    add.d XX, XX, INCX
    ST b3, XX, 0 * SIZE
    ST b4, XX, 1 * SIZE
    add.d XX, XX, INCX
    addi.d I, I, -1
    blt $r0, I, .L222
    .align 3

.L223:
    andi I, N, 3
    bge $r0, I, .L999
    .align 3

.L224:
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    LD a3, Y, 0 * SIZE
    LD a4, Y, 1 * SIZE
    ST a1, Y, 0 * SIZE
    ST a2, Y, 1 * SIZE
    ST a3, X, 0 * SIZE
    ST a4, X, 1 * SIZE

    addi.d I, I, -1
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
    .align 3

.L999:
    move $r4, $r12
    jirl $r0, $r1, 0x0
    .align 3

    EPILOGUE
File diff suppressed because it is too large
@@ -0,0 +1,185 @@

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst

#define I $r9
#define J $r10
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
#define S8 $r19
#define TD $r20
#define TS $r21
#define TL $r7
#define T0 $r6
#define ZERO $r0

#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LSX vectors */
#define U0 $vr0
#define U1 $vr1
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7
#define D0 $vr8
#define D1 $vr9
#define D2 $vr10
#define D3 $vr11
#define D4 $vr12
#define D5 $vr13
#define D6 $vr14
#define D7 $vr15

    PROLOGUE

    move TD, DST
    move TS, SRC
    slli.d TL, LDA, 0x03
    slli.d T0, TL, 0x01
    srai.d J, N, 0x02
    beq J, ZERO, .L_N2
.L_J1: /* J-- */
    move S1, TS
    add.d S2, TS, TL
    srai.d I, M, 0x02
    add.d S3, S2, TL
    add.d S4, S2, T0
    add.d TS, S3, T0
    addi.d J, J, -1
    beq I, ZERO, .L_I3
.L_I1: /* I-- */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
    addi.d TD, TD, 0x40

    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30

    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d S3, S3, 0x20
    addi.d S4, S4, 0x20
    addi.d TD, TD, 0x40

    addi.d I, I, -1
    blt ZERO, I, .L_I1
.L_I3:
    andi I, M, 0x03
    beq I, ZERO, .L_I0
.L_II1:
    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fld.d F2, S3, 0x00
    fld.d F3, S4, 0x00

    fst.d F0, TD, 0x00
    addi.d S1, S1, 0x08
    fst.d F1, TD, 0x08
    addi.d S2, S2, 0x08
    fst.d F2, TD, 0x10
    addi.d S3, S3, 0x08
    fst.d F3, TD, 0x18
    addi.d S4, S4, 0x08

    addi.d TD, TD, 0x20
    addi.d I, I, -1
    blt ZERO, I, .L_II1
.L_I0:
    blt ZERO, J, .L_J1
.L_N2:
    andi J, N, 0x02
    beq ZERO, J, .L_N1

    move S1, TS
    add.d S2, TS, TL
    srai.d I, M, 0x01
    add.d TS, S2, TL
    beq I, ZERO, .L_2I3
.L_2I1: /* I-- */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00
    GINTERLACE v, d, D0, D1, U1, U0
    GST v, , D0, TD, 0x00, D1, TD, 0x10
    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d TD, TD, 0x20

    addi.d I, I, -1
    blt ZERO, I, .L_2I1
.L_2I3:
    andi I, M, 0x01
    beq ZERO, I, .L_N1
.L_2II1: /* I-- */
    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fst.d F0, TD, 0x00
    addi.d I, I, -1
    fst.d F1, TD, 0x08
    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d TD, TD, 0x10
    blt ZERO, I, .L_2II1
.L_N1:
    move S1, TS
    beq ZERO, M, .L_N0
.L_M1:
    fld.d F0, S1, 0x00
    addi.d S1, S1, 0x08
    fst.d F0, TD, 0x00
    addi.d TD, TD, 0x08
    addi.d M, M, -1
    blt ZERO, M, .L_M1
.L_N0:
    jirl $r0, $r1, 0x00
    EPILOGUE
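
The GLD/GINTERLACE/GST sequence above packs panels of four source rows so that consecutive destination elements cycle through the rows. Below is a minimal C sketch of that interleaved packing; the name ncopy4_ref is hypothetical, lda is assumed to be the row stride in doubles, and the n % 4 tails (the 2-wide and 1-wide passes at .L_N2/.L_N1) are only summarized in the trailing comment.

#include <stddef.h>

/* Hypothetical reference for the 4-row interleave pack above:
 * dst receives m groups of 4 elements, one from each row of the panel. */
void ncopy4_ref(ptrdiff_t m, ptrdiff_t n, const double *src, ptrdiff_t lda,
                double *dst)
{
    for (ptrdiff_t j = 0; j + 4 <= n; j += 4) {   /* .L_J1 panel loop */
        const double *s1 = src + (j + 0) * lda;
        const double *s2 = src + (j + 1) * lda;
        const double *s3 = src + (j + 2) * lda;
        const double *s4 = src + (j + 3) * lda;
        for (ptrdiff_t i = 0; i < m; i++) {       /* GINTERLACE effect */
            *dst++ = s1[i];
            *dst++ = s2[i];
            *dst++ = s3[i];
            *dst++ = s4[i];
        }
    }
    /* remaining n % 4 rows are packed 2-wide and then 1-wide the same way,
     * mirroring .L_N2 and .L_N1 above */
}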
@@ -0,0 +1,283 @@

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst

#define I $r9
#define J $r10
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
#define S8 $r19
#define TD $r20
#define TS $r21
#define TL $r7
#define T0 $r6
#define ZERO $r0

#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LSX vectors */
#define U0 $vr0
#define U1 $vr1
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7
#define D0 $vr8
#define D1 $vr9
#define D2 $vr10
#define D3 $vr11
#define D4 $vr12
#define D5 $vr13
#define D6 $vr14
#define D7 $vr15

    PROLOGUE
    push_if_used 26, 32
    move TD, DST
    move TS, SRC
    slli.d TL, LDA, 0x03
    slli.d T0, TL, 0x01
    srai.d J, N, 0x03
    beq J, ZERO, .L_N4
.L_J1:
    move S1, TS
    add.d S2, TS, TL
    srai.d I, M, 0x03
    add.d S3, S2, TL
    addi.d J, J, -1
    add.d S4, S3, TL
    add.d S5, S3, T0
    add.d S6, S4, T0
    add.d S7, S5, T0
    add.d S8, S6, T0
    add.d TS, S7, T0
    beq I, ZERO, .L_I7
.L_I1:
    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \
            U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
            D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d TD, TD, 0x80
    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \
            U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
            D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d TD, TD, 0x80
    GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \
            U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
            D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d TD, TD, 0x80
    GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \
            U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30
    GINTERLACE v, d, D0, D4, U1, U0
    GINTERLACE v, d, D1, D5, U3, U2
    GINTERLACE v, d, D2, D6, U5, U4
    GINTERLACE v, d, D3, D7, U7, U6
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
            D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
    addi.d TD, TD, 0x80

    addi.d S1, S1, 0x40
    addi.d S2, S2, 0x40
    addi.d S3, S3, 0x40
    addi.d S4, S4, 0x40
    addi.d S5, S5, 0x40
    addi.d S6, S6, 0x40
    addi.d S7, S7, 0x40
    addi.d S8, S8, 0x40

    addi.d I, I, -1
    blt ZERO, I, .L_I1
.L_I7:
    andi I, M, 0x07
    beq I, ZERO, .L_I0
.L_II1: /* I-- */
    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fld.d F2, S3, 0x00
    fld.d F3, S4, 0x00
    fld.d F4, S5, 0x00
    fld.d F5, S6, 0x00
    fld.d F6, S7, 0x00
    fld.d F7, S8, 0x00

    fst.d F0, TD, 0x00
    addi.d S1, S1, 0x08
    fst.d F1, TD, 0x08
    addi.d S2, S2, 0x08
    fst.d F2, TD, 0x10
    addi.d S3, S3, 0x08
    fst.d F3, TD, 0x18
    addi.d S4, S4, 0x08
    fst.d F4, TD, 0x20
    addi.d S5, S5, 0x08
    fst.d F5, TD, 0x28
    addi.d S6, S6, 0x08
    fst.d F6, TD, 0x30
    addi.d S7, S7, 0x08
    fst.d F7, TD, 0x38
    addi.d S8, S8, 0x08
    addi.d TD, TD, 0x40

    addi.d I, I, -1
    blt ZERO, I, .L_II1
.L_I0:
    blt ZERO, J, .L_J1
.L_N4:
    andi J, N, 0x04
    beq ZERO, J, .L_N2

    move S1, TS
    add.d S2, TS, TL
    srai.d I, M, 0x02
    add.d S3, S2, TL
    add.d S4, S2, T0
    add.d TS, S3, T0
    beq I, ZERO, .L_I3
.L_4I1: /* I-- */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
    addi.d TD, TD, 0x40

    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
    GINTERLACE v, d, D0, D2, U1, U0
    GINTERLACE v, d, D1, D3, U3, U2
    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30

    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d S3, S3, 0x20
    addi.d S4, S4, 0x20
    addi.d TD, TD, 0x40

    addi.d I, I, -1
    blt ZERO, I, .L_4I1
.L_I3:
    andi I, M, 0x03
    beq I, ZERO, .L_N2
.L_4II1:
    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fld.d F2, S3, 0x00
    fld.d F3, S4, 0x00

    fst.d F0, TD, 0x00
    addi.d S1, S1, 0x08
    fst.d F1, TD, 0x08
    addi.d S2, S2, 0x08
    fst.d F2, TD, 0x10
    addi.d S3, S3, 0x08
    fst.d F3, TD, 0x18
    addi.d S4, S4, 0x08

    addi.d TD, TD, 0x20
    addi.d I, I, -1
    blt ZERO, I, .L_4II1
.L_N2:
    andi J, N, 0x02
    beq ZERO, J, .L_N1

    move S1, TS
    add.d S2, TS, TL
    srai.d I, M, 0x01
    add.d TS, S2, TL
    beq I, ZERO, .L_NI1
.L_2I1: /* I-- */
    GLD v, , U0, S1, 0x00, U1, S2, 0x00
    GINTERLACE v, d, D0, D1, U1, U0
    GST v, , D0, TD, 0x00, D1, TD, 0x10

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d TD, TD, 0x20

    addi.d I, I, -1
    blt ZERO, I, .L_2I1
.L_NI1:
    andi I, M, 0x01
    beq I, ZERO, .L_N1

    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00

    fst.d F0, TD, 0x00
    addi.d S1, S1, 0x08
    fst.d F1, TD, 0x08
    addi.d S2, S2, 0x08
    addi.d TD, TD, 0x10
.L_N1:
    move S1, TS
    beq ZERO, M, .L_N0
.L_M1:
    fld.d F0, S1, 0x00
    addi.d S1, S1, 0x08
    fst.d F0, TD, 0x00
    addi.d TD, TD, 0x08
    addi.d M, M, -1
    blt ZERO, M, .L_M1
.L_N0:
    pop_if_used 26, 32
    jirl $r0, $r1, 0x00
    EPILOGUE
@@ -0,0 +1,280 @@

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst

#define I $r9
#define J $r10
#define S0 $r11
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define P0 $r16
#define P1 $r17
#define P2 $r18
#define P3 $r19
#define T0 $r20
#define T1 $r23
#define TL $r7
#define ZERO $r0

#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
/* LSX vectors */
#define U0 $vr0
#define U1 $vr1
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7

    PROLOGUE
    push_if_used 18, 8

    move S0, SRC
    move P0, DST

    // Find P0, P2, P3
    srai.d T0, N, 0x02
    slli.d T0, T0, 0x02
    srai.d T1, N, 0x01
    slli.d T1, T1, 0x01
    mul.d T0, M, T0
    mul.d T1, M, T1
    slli.d T0, T0, 0x03
    slli.d T1, T1, 0x03
    add.d P2, DST, T0
    add.d P3, DST, T1

    slli.d TL, LDA, 0x03
    srai.d J, M, 0x02
    slli.d T0, TL, 0x01
    slli.d T1, M, 0x05
    beq ZERO, J, .L_M3
.L_J1: /* J-- */
    move S1, S0
    add.d S2, S0, TL
    add.d S3, S1, T0
    add.d S4, S2, T0
    add.d S0, S3, T0

    move P1, P0
    addi.d P0, P0, 0x80

    srai.d I, N, 0x02
    addi.d J, J, -1
    beq ZERO, I, .L_N3
.L_I1: /* I-- */
    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S2, 0x00
    vld U3, S2, 0x10
    vld U4, S3, 0x00
    vld U5, S3, 0x10
    vld U6, S4, 0x00
    vld U7, S4, 0x10

    vst U0, P1, 0x00
    vst U1, P1, 0x10
    vst U2, P1, 0x20
    vst U3, P1, 0x30
    vst U4, P1, 0x40
    vst U5, P1, 0x50
    vst U6, P1, 0x60
    vst U7, P1, 0x70

    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d S3, S3, 0x20
    addi.d S4, S4, 0x20
    add.d P1, P1, T1

    addi.d I, I, -1
    blt ZERO, I, .L_I1
.L_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_N1

    vld U0, S1, 0x00
    vld U1, S2, 0x00
    vld U2, S3, 0x00
    vld U3, S4, 0x00

    vst U0, P2, 0x00
    vst U1, P2, 0x10
    vst U2, P2, 0x20
    vst U3, P2, 0x30

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d S3, S3, 0x10
    addi.d S4, S4, 0x10
    addi.d P2, P2, 0x40
.L_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_N0

    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fld.d F2, S3, 0x00
    fld.d F3, S4, 0x00

    fst.d F0, P3, 0x00
    fst.d F1, P3, 0x08
    fst.d F2, P3, 0x10
    fst.d F3, P3, 0x18

    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d S3, S3, 0x08
    addi.d S4, S4, 0x08
    addi.d P3, P3, 0x20

.L_N0:
    blt ZERO, J, .L_J1

.L_M3:
    andi J, M, 0x02
    beq ZERO, J, .L_M1

    move S1, S0
    add.d S2, S0, TL
    add.d S0, S0, T0

    move P1, P0
    addi.d P0, P0, 0x40

    srai.d I, N, 0x02
    beq ZERO, I, .L_2N3

.L_2I1: /* I-- */
    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S2, 0x00
    vld U3, S2, 0x10

    vst U0, P1, 0x00
    vst U1, P1, 0x10
    vst U2, P1, 0x20
    vst U3, P1, 0x30

    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d I, I, -1
    add.d P1, P1, T1

    blt ZERO, I, .L_2I1

.L_2N3:
    andi I, N, 0x02
    beq ZERO, I, .L_2N1

    vld U0, S1, 0x00
    vld U1, S2, 0x00

    vst U0, P2, 0x00
    vst U1, P2, 0x10

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d P2, P2, 0x20

.L_2N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M1

    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00

    fst.d F0, P3, 0x00
    fst.d F1, P3, 0x08

    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d P3, P3, 0x10
.L_M1:
    andi J, M, 0x01
    beq ZERO, J, .L_M0

    move S1, S0
    move P1, P0

    srai.d I, N, 0x02
    beq ZERO, I, .L_1N3

.L_1I1:
    vld U0, S1, 0x00
    vld U1, S1, 0x10

    vst U0, P1, 0x00
    vst U1, P1, 0x10

    addi.d S1, S1, 0x20
    addi.d I, I, -1
    add.d P1, P1, T1

    blt ZERO, I, .L_1I1

.L_1N3:
    andi I, N, 0x02
    beq I, ZERO, .L_1N1

    fld.d F0, S1, 0x00
    fld.d F1, S1, 0x08

    fst.d F0, P2, 0x00
    fst.d F1, P2, 0x08

    addi.d S1, S1, 0x10
    addi.d P2, P2, 0x10

.L_1N1:
    andi I, N, 0x01
    beq I, ZERO, .L_M0

    fld.d F0, S1, 0x00

    fst.d F0, P3, 0x00

.L_M0:
    pop_if_used 18, 8
    jirl $r0, $r1, 0x00

    EPILOGUE
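
The vld/vst pairs above copy the source in 4x4 double blocks, with the n % 4 column tails routed to separate regions whose offsets are precomputed from M and N (the P2/P3 pointers). Below is a hedged sketch of the per-block copy only; the full buffer layout (block spacing via T1 and the tail regions) is more involved than shown, and the helper name tcopy4_block is hypothetical.

/* Copy one 4x4 block of a row-major matrix into a contiguous block,
 * row by row, as the four vld/vst pairs per source row do above. */
void tcopy4_block(const double *a, long lda, double *blk)
{
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            blk[4 * r + c] = a[r * lda + c];
}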
@@ -0,0 +1,597 @@

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst

#define I $r9
#define J $r10
#define S0 $r11
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
#define S8 $r19
#define P0 $r20
#define P1 $r23
#define P2 $r24
#define P3 $r25
#define P4 $r26
#define P5 $r27
#define T0 $r28
#define T1 $r29
#define TL $r7
#define ZERO $r0

#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LSX vectors */
#define U0 $vr0
#define U1 $vr1
#define U2 $vr2
#define U3 $vr3
#define U4 $vr4
#define U5 $vr5
#define U6 $vr6
#define U7 $vr7

    PROLOGUE
    push_if_used 24, 8

    move S0, SRC
    move P0, DST

    srai.d T0, N, 0x03
    srai.d T1, N, 0x02
    slli.d T0, T0, 0x03
    slli.d T1, T1, 0x02
    mul.d P2, M, T0
    mul.d P3, M, T1
    slli.d P2, P2, 0x03
    slli.d P3, P3, 0x03
    add.d P2, DST, P2
    add.d P3, DST, P3

    srai.d T0, N, 0x01
    slli.d T0, T0, 0x01
    mul.d P4, M, T0
    slli.d P4, P4, 0x03
    add.d P4, DST, P4

    slli.d TL, LDA, 0x03
    srai.d J, M, 0x03
    slli.d T0, TL, 0x01
    slli.d T1, M, 0x06
    beq ZERO, J, .L_M7
.L_J1: /* J-- */
    move S1, S0
    add.d S2, S0, TL
    add.d S3, S1, T0
    add.d S4, S2, T0
    add.d S5, S3, T0
    add.d S6, S4, T0
    add.d S7, S5, T0
    add.d S8, S6, T0
    add.d S0, S7, T0

    move P1, P0
    addi.d P0, P0, 0x200

    srai.d I, N, 0x03
    addi.d J, J, -1
    beq ZERO, I, .L_N7

.L_I1: /* I-- */
    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S1, 0x20
    vld U3, S1, 0x30
    vld U4, S2, 0x00
    vld U5, S2, 0x10
    vld U6, S2, 0x20
    vld U7, S2, 0x30

    vst U0, P1, 0x00
    vst U1, P1, 0x10
    vst U2, P1, 0x20
    vst U3, P1, 0x30
    vst U4, P1, 0x40
    vst U5, P1, 0x50
    vst U6, P1, 0x60
    vst U7, P1, 0x70

    vld U0, S3, 0x00
    vld U1, S3, 0x10
    vld U2, S3, 0x20
    vld U3, S3, 0x30
    vld U4, S4, 0x00
    vld U5, S4, 0x10
    vld U6, S4, 0x20
    vld U7, S4, 0x30

    vst U0, P1, 0x80
    vst U1, P1, 0x90
    vst U2, P1, 0xa0
    vst U3, P1, 0xb0
    vst U4, P1, 0xc0
    vst U5, P1, 0xd0
    vst U6, P1, 0xe0
    vst U7, P1, 0xf0

    vld U0, S5, 0x00
    vld U1, S5, 0x10
    vld U2, S5, 0x20
    vld U3, S5, 0x30
    vld U4, S6, 0x00
    vld U5, S6, 0x10
    vld U6, S6, 0x20
    vld U7, S6, 0x30

    vst U0, P1, 0x100
    vst U1, P1, 0x110
    vst U2, P1, 0x120
    vst U3, P1, 0x130
    vst U4, P1, 0x140
    vst U5, P1, 0x150
    vst U6, P1, 0x160
    vst U7, P1, 0x170

    vld U0, S7, 0x00
    vld U1, S7, 0x10
    vld U2, S7, 0x20
    vld U3, S7, 0x30
    vld U4, S8, 0x00
    vld U5, S8, 0x10
    vld U6, S8, 0x20
    vld U7, S8, 0x30

    vst U0, P1, 0x180
    vst U1, P1, 0x190
    vst U2, P1, 0x1a0
    vst U3, P1, 0x1b0
    vst U4, P1, 0x1c0
    vst U5, P1, 0x1d0
    vst U6, P1, 0x1e0
    vst U7, P1, 0x1f0

    addi.d S1, S1, 0x40
    addi.d S2, S2, 0x40
    addi.d S3, S3, 0x40
    addi.d S4, S4, 0x40
    addi.d S5, S5, 0x40
    addi.d S6, S6, 0x40
    addi.d S7, S7, 0x40
    addi.d S8, S8, 0x40
    addi.d I, I, -1
    add.d P1, P1, T1
    blt ZERO, I, .L_I1
.L_N7:
    andi I, N, 0x04
    beq ZERO, I, .L_N3

    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S2, 0x00
    vld U3, S2, 0x10
    vld U4, S3, 0x00
    vld U5, S3, 0x10
    vld U6, S4, 0x00
    vld U7, S4, 0x10

    vst U0, P2, 0x00
    vst U1, P2, 0x10
    vst U2, P2, 0x20
    vst U3, P2, 0x30
    vst U4, P2, 0x40
    vst U5, P2, 0x50
    vst U6, P2, 0x60
    vst U7, P2, 0x70

    vld U0, S5, 0x00
    vld U1, S5, 0x10
    vld U2, S6, 0x00
    vld U3, S6, 0x10
    vld U4, S7, 0x00
    vld U5, S7, 0x10
    vld U6, S8, 0x00
    vld U7, S8, 0x10

    vst U0, P2, 0x80
    vst U1, P2, 0x90
    vst U2, P2, 0xa0
    vst U3, P2, 0xb0
    vst U4, P2, 0xc0
    vst U5, P2, 0xd0
    vst U6, P2, 0xe0
    vst U7, P2, 0xf0

    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d S3, S3, 0x20
    addi.d S4, S4, 0x20
    addi.d S5, S5, 0x20
    addi.d S6, S6, 0x20
    addi.d S7, S7, 0x20
    addi.d S8, S8, 0x20
    addi.d P2, P2, 0x100

.L_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_N1

    vld U0, S1, 0x00
    vld U1, S2, 0x00
    vld U2, S3, 0x00
    vld U3, S4, 0x00
    vld U4, S5, 0x00
    vld U5, S6, 0x00
    vld U6, S7, 0x00
    vld U7, S8, 0x00

    vst U0, P3, 0x00
    vst U1, P3, 0x10
    vst U2, P3, 0x20
    vst U3, P3, 0x30
    vst U4, P3, 0x40
    vst U5, P3, 0x50
    vst U6, P3, 0x60
    vst U7, P3, 0x70

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d S3, S3, 0x10
    addi.d S4, S4, 0x10
    addi.d S5, S5, 0x10
    addi.d S6, S6, 0x10
    addi.d S7, S7, 0x10
    addi.d S8, S8, 0x10
    addi.d P3, P3, 0x80

.L_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_N0

    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fld.d F2, S3, 0x00
    fld.d F3, S4, 0x00
    fld.d F4, S5, 0x00
    fld.d F5, S6, 0x00
    fld.d F6, S7, 0x00
    fld.d F7, S8, 0x00

    fst.d F0, P4, 0x00
    fst.d F1, P4, 0x08
    fst.d F2, P4, 0x10
    fst.d F3, P4, 0x18
    fst.d F4, P4, 0x20
    fst.d F5, P4, 0x28

    fst.d F6, P4, 0x30
    fst.d F7, P4, 0x38

    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d S3, S3, 0x08
    addi.d S4, S4, 0x08
    addi.d S5, S5, 0x08
    addi.d S6, S6, 0x08
    addi.d S7, S7, 0x08
    addi.d S8, S8, 0x08
    addi.d P4, P4, 0x40

.L_N0:
    blt ZERO, J, .L_J1
.L_M7:
    andi J, M, 0x04
    beq ZERO, J, .L_M3

    move S1, S0
    add.d S2, S0, TL
    add.d S3, S1, T0
    add.d S4, S2, T0
    add.d S0, S3, T0

    move P1, P0
    addi.d P0, P0, 0x100

    srai.d I, N, 0x03
    beq ZERO, I, .L_4N7
.L_4I1: /* I-- */
    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S1, 0x20
    vld U3, S1, 0x30
    vld U4, S2, 0x00
    vld U5, S2, 0x10
    vld U6, S2, 0x20
    vld U7, S2, 0x30

    vst U0, P1, 0x00
    vst U1, P1, 0x10
    vst U2, P1, 0x20
    vst U3, P1, 0x30
    vst U4, P1, 0x40
    vst U5, P1, 0x50
    vst U6, P1, 0x60
    vst U7, P1, 0x70

    vld U0, S3, 0x00
    vld U1, S3, 0x10
    vld U2, S3, 0x20
    vld U3, S3, 0x30
    vld U4, S4, 0x00
    vld U5, S4, 0x10
    vld U6, S4, 0x20
    vld U7, S4, 0x30

    vst U0, P1, 0x80
    vst U1, P1, 0x90
    vst U2, P1, 0xa0
    vst U3, P1, 0xb0
    vst U4, P1, 0xc0
    vst U5, P1, 0xd0
    vst U6, P1, 0xe0
    vst U7, P1, 0xf0

    addi.d S1, S1, 0x40
    addi.d S2, S2, 0x40
    addi.d S3, S3, 0x40
    addi.d S4, S4, 0x40
    addi.d I, I, -1
    add.d P1, P1, T1
    blt ZERO, I, .L_4I1
.L_4N7:
    andi I, N, 0x04
    beq ZERO, I, .L_4N3

    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S2, 0x00
    vld U3, S2, 0x10
    vld U4, S3, 0x00
    vld U5, S3, 0x10
    vld U6, S4, 0x00
    vld U7, S4, 0x10

    vst U0, P2, 0x00
    vst U1, P2, 0x10
    vst U2, P2, 0x20
    vst U3, P2, 0x30
    vst U4, P2, 0x40
    vst U5, P2, 0x50
    vst U6, P2, 0x60
    vst U7, P2, 0x70

    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d S3, S3, 0x20
    addi.d S4, S4, 0x20
    addi.d P2, P2, 0x80

.L_4N3:
    andi I, N, 0x02
    beq ZERO, I, .L_4N1

    vld U0, S1, 0x00
    vld U1, S2, 0x00
    vld U2, S3, 0x00
    vld U3, S4, 0x00

    vst U0, P3, 0x00
    vst U1, P3, 0x10
    vst U2, P3, 0x20
    vst U3, P3, 0x30

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d S3, S3, 0x10
    addi.d S4, S4, 0x10
    addi.d P3, P3, 0x40

.L_4N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M3

    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00
    fld.d F2, S3, 0x00
    fld.d F3, S4, 0x00

    fst.d F0, P4, 0x00
    fst.d F1, P4, 0x08
    fst.d F2, P4, 0x10
    fst.d F3, P4, 0x18

    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d S3, S3, 0x08
    addi.d S4, S4, 0x08
    addi.d P4, P4, 0x20
.L_M3:
    andi J, M, 0x02
    beq ZERO, J, .L_M1

    move S1, S0
    add.d S2, S0, TL
    add.d S0, S0, T0

    move P1, P0
    addi.d P0, P0, 0x80

    srai.d I, N, 0x03
    beq ZERO, I, .L_2N7
.L_2I1: /* I-- */
    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S1, 0x20
    vld U3, S1, 0x30
    vld U4, S2, 0x00
    vld U5, S2, 0x10
    vld U6, S2, 0x20
    vld U7, S2, 0x30

    vst U0, P1, 0x00
    vst U1, P1, 0x10
    vst U2, P1, 0x20
    vst U3, P1, 0x30
    vst U4, P1, 0x40
    vst U5, P1, 0x50
    vst U6, P1, 0x60
    vst U7, P1, 0x70

    addi.d S1, S1, 0x40
    addi.d S2, S2, 0x40
    addi.d I, I, -1
    add.d P1, P1, T1
    blt ZERO, I, .L_2I1
.L_2N7:
    andi I, N, 0x04
    beq ZERO, I, .L_2N3

    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S2, 0x00
    vld U3, S2, 0x10

    vst U0, P2, 0x00
    vst U1, P2, 0x10
    vst U2, P2, 0x20
    vst U3, P2, 0x30

    addi.d S1, S1, 0x20
    addi.d S2, S2, 0x20
    addi.d P2, P2, 0x40

.L_2N3:
    andi I, N, 0x02
    beq ZERO, I, .L_2N1

    vld U0, S1, 0x00
    vld U1, S2, 0x00

    vst U0, P3, 0x00
    vst U1, P3, 0x10

    addi.d S1, S1, 0x10
    addi.d S2, S2, 0x10
    addi.d P3, P3, 0x20

.L_2N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M1

    fld.d F0, S1, 0x00
    fld.d F1, S2, 0x00

    fst.d F0, P4, 0x00
    fst.d F1, P4, 0x08

    addi.d S1, S1, 0x08
    addi.d S2, S2, 0x08
    addi.d P4, P4, 0x10
.L_M1:
    andi J, M, 0x01
    beq ZERO, J, .L_M0

    move S1, S0
    add.d S2, S0, TL

    move P1, P0
    addi.d P0, P0, 0x40

    srai.d I, N, 0x03
    beq ZERO, I, .L_1N7
.L_1I1: /* I-- */
    vld U0, S1, 0x00
    vld U1, S1, 0x10
    vld U2, S1, 0x20
    vld U3, S1, 0x30

    vst U0, P1, 0x00
    vst U1, P1, 0x10
    vst U2, P1, 0x20
    vst U3, P1, 0x30

    addi.d S1, S1, 0x40
    addi.d I, I, -1
    add.d P1, P1, T1
    blt ZERO, I, .L_1I1

.L_1N7:
    andi I, N, 0x04
    beq ZERO, I, .L_1N3

    vld U0, S1, 0x00
    vld U1, S1, 0x10

    vst U0, P2, 0x00
    vst U1, P2, 0x10

    addi.d S1, S1, 0x20
    addi.d P2, P2, 0x20

.L_1N3:
    andi I, N, 0x02
    beq ZERO, I, .L_1N1

    vld U0, S1, 0x00
    vst U0, P3, 0x00

    addi.d S1, S1, 0x10
    addi.d P3, P3, 0x10

.L_1N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M0

    fld.d F0, S1, 0x00

    fst.d F0, P4, 0x00

    addi.d S1, S1, 0x08
    addi.d P4, P4, 0x08
.L_M0:
    pop_if_used 24, 8
    jirl $r0, $r1, 0x00
    EPILOGUE
|
||||
|
|
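The fragment above is the tail of a LoongArch64 LSX copy (packing) kernel: the .L_4xx, .L_2xx and .L_1xx label families handle panels of four, two and one source rows, with vector loads and stores for column blocks of 8, 4 and 2 and scalar fld.d/fst.d for the last column. As rough orientation only, here is a scalar C model of that 4/2/1 row split; the function name and signature are hypothetical, and the real kernel additionally writes each column-block remainder through separate destination cursors (P1..P4) rather than one running pointer.

/* Hypothetical reference model of a 4/2/1-row packing loop (illustrative,
 * not this kernel's actual interface). */
static void pack_4_2_1(long m, long n, const double *src, long ld, double *dst)
{
    long j = 0;
    for (; j + 4 <= m; j += 4)            /* four rows at a time (.L_4xx) */
        for (long i = 0; i < n; i++)
            for (long r = 0; r < 4; r++)
                *dst++ = src[(j + r) * ld + i];
    for (; j + 2 <= m; j += 2)            /* two-row remainder (.L_2xx) */
        for (long i = 0; i < n; i++)
            for (long r = 0; r < 2; r++)
                *dst++ = src[(j + r) * ld + i];
    for (; j < m; j++)                    /* one-row remainder (.L_1xx) */
        for (long i = 0; i < n; i++)
            *dst++ = src[j * ld + i];
}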
@@ -0,0 +1,257 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define XX $r19
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15

/* Don't change following FR unless you know the effects. */
#define VX0 $xr15
#define VX1 $xr16
#define VM0 $xr17
#define VM1 $xr18
#define VM2 $xr13
#define VM3 $xr14
#define res1 $xr19
#define res2 $xr20
#define VALPHA $xr21
#define INF $f23
#define a1 $f22
#define max $f17
#define ALPHA $f12

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

xvxor.v res1, res1, res1
xvxor.v res2, res2, res2
xvxor.v VM0, VM0, VM0
bge $r0, N, .L999
beq $r0, INCX, .L999
move XX, X
// Init INF
addi.d TEMP, $r0, 0x7FF
slli.d TEMP, TEMP, 52
MTC INF, TEMP
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L97
.align 3

.L10:
xvld VX0, X, 0
xvld VX1, X, 4 * SIZE
xvfmaxa.d VM1, VX1, VX0
xvfmaxa.d VM0, VM0, VM1
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
b .L96
.align 3

.L20: // INCX!=1
bge $r0, I, .L97
.align 3

.L21:
ld.d t1, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t3, 2
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
ld.d t2, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t2, 1
ld.d t3, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t3, 2
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX1, t4, 3
xvfmaxa.d VM1, VX0, VX1
xvfmaxa.d VM0, VM0, VM1
addi.d I, I, -1
blt $r0, I, .L21
b .L96
.align 3

.L96:
xvpickve.d VX0, VM0, 1
xvpickve.d VX1, VM0, 2
xvpickve.d VM3, VM0, 3
fmaxa.d $f17, $f17, $f14
fmaxa.d $f17, $f17, $f15
fmaxa.d $f17, $f17, $f16
.align 3

.L97:
andi I, N, 7
bge $r0, I, .L99
.align 3

.L98:
xvld VX1, X, 0
xvfmaxa.d VM0, VM0, VX1
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L98
.align 3

.L99:
fabs.d max, max
lu12i.w TEMP, 0x3f800 // 1
movgr2fr.d a1, $r0
movgr2fr.w ALPHA, TEMP
CMPEQ $fcc0, max, a1
fcvt.d.s ALPHA, ALPHA
bcnez $fcc0, .L999
fdiv.d ALPHA, ALPHA, max
CMPEQ $fcc0, INF, ALPHA
bcnez $fcc0, .L999
movfr2gr.d TEMP, ALPHA
xvreplgr2vr.d VALPHA, TEMP

.L100:
li.d TEMP, SIZE
bne INCX, TEMP, .L120
srai.d I, N, 3
bge $r0, I, .L997
.align 3

.L110:
xvld VX0, XX, 0
xvld VX1, XX, 4 * SIZE
xvfmul.d VM2, VX0, VALPHA
xvfmul.d VM3, VX1, VALPHA
xvfmadd.d res1, VM2, VM2, res1
xvfmadd.d res2, VM3, VM3, res2
addi.d XX, XX, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L110
b .L996
.align 3

.L120:
srai.d I, N, 3
bge $r0, I, .L997

.L121:
ld.d t1, XX, 0
add.d XX, XX, INCX
ld.d t2, XX, 0
add.d XX, XX, INCX
ld.d t3, XX, 0
add.d XX, XX, INCX
ld.d t4, XX, 0
add.d XX, XX, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, XX, 0
add.d XX, XX, INCX
ld.d t2, XX, 0
add.d XX, XX, INCX
ld.d t3, XX, 0
add.d XX, XX, INCX
ld.d t4, XX, 0
add.d XX, XX, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfmul.d VM2, VX0, VALPHA
xvfmul.d VM3, VX1, VALPHA
xvfmadd.d res1, VM2, VM2, res1
xvfmadd.d res2, VM3, VM3, res2
addi.d I, I, -1
blt $r0, I, .L121
b .L996
.align 3

.L996:
xvfadd.d res1, res1, res2
xvpickve.d VX0, res1, 1
xvpickve.d VX1, res1, 2
xvpickve.d VM2, res1, 3
fadd.d $f19, $f19, $f15
fadd.d $f19, $f19, $f16
fadd.d $f19, $f19, $f13
.align 3

.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L998:
fld.d $f15, XX, 0
addi.d I, I, -1
fmul.d $f15, $f15, ALPHA
fmadd.d $f19, $f15, $f15, $f19
add.d XX, XX, INCX
blt $r0, I, .L998

.L999:
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
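The file above computes a Euclidean norm in two passes to avoid overflow and underflow: first it scans for the element of largest magnitude (the xvfmaxa.d loops plus the lane reduction at .L96), then it accumulates the sum of squares of x scaled by 1/max and returns max * sqrt(sum). A minimal scalar C model of that scheme follows; it is illustrative only (the function name and signature are hypothetical, and the kernel's early exits at .L999 likewise yield zero when max is 0 or when 1/max overflows to infinity).

#include <math.h>

/* Reference model of the two-pass scaled nrm2 above (illustrative). */
double nrm2_scaled(long n, const double *x, long incx)
{
    double amax = 0.0, sum = 0.0;
    for (long i = 0; i < n; i++) {          /* pass 1: largest |x[i]| */
        double a = fabs(x[i * incx]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0 || isinf(1.0 / amax))   /* mirrors the .L999 early exits */
        return 0.0;
    double alpha = 1.0 / amax;
    for (long i = 0; i < n; i++) {          /* pass 2: sum of squares, scaled */
        double t = x[i * incx] * alpha;
        sum += t * t;
    }
    return amax * sqrt(sum);
}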
@@ -0,0 +1,268 @@
/*****************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define XX $r19
#define I $r17
#define TEMP $r18
#define t1 $r12
#define t2 $r13
#define t3 $r14
#define t4 $r15

/* Don't change following FR unless you know the effects. */
#define VX0 $vr15
#define VX1 $vr16
#define VM0 $vr17
#define VM1 $vr18
#define VM2 $vr13
#define VM3 $vr14
#define res1 $vr19
#define res2 $vr20
#define VALPHA $vr21
#define INF $f23
#define a1 $f22
#define max $f17
#define ALPHA $f12

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

vxor.v res1, res1, res1
vxor.v res2, res2, res2
vxor.v VM0, VM0, VM0
bge $r0, N, .L999
beq $r0, INCX, .L999
move XX, X
// Init INF
addi.d TEMP, $r0, 0x7FF
slli.d TEMP, TEMP, 52
MTC INF, TEMP
li.d TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
srai.d I, N, 3
bne INCX, TEMP, .L20

bge $r0, I, .L97
.align 3

.L10:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfmaxa.d VM1, VX1, VX0
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vfmaxa.d VM2, VX1, VX0
vfmaxa.d VM3, VM1, VM2
vfmaxa.d VM0, VM0, VM3
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L10
b .L96
.align 3

.L20: // INCX!=1
bge $r0, I, .L97
.align 3

.L21:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM2, VX0, VX1
vfmaxa.d VM3, VM1, VM2
vfmaxa.d VM0, VM0, VM3
addi.d I, I, -1
blt $r0, I, .L21
b .L96
.align 3

.L96:
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
vfmaxa.d VM0, VX0, VX1
.align 3

.L97:
andi I, N, 7
bge $r0, I, .L99
.align 3

.L98:
vld VX1, X, 0
vfmaxa.d VM0, VM0, VX1
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L98
.align 3

.L99:
fabs.d max, max
lu12i.w TEMP, 0x3f800 // 1
movgr2fr.d a1, $r0
movgr2fr.w ALPHA, TEMP
CMPEQ $fcc0, max, a1
fcvt.d.s ALPHA, ALPHA
bcnez $fcc0, .L999
fdiv.d ALPHA, ALPHA, max
CMPEQ $fcc0, INF, ALPHA
bcnez $fcc0, .L999
movfr2gr.d TEMP, ALPHA
vreplgr2vr.d VALPHA, TEMP

.L100:
li.d TEMP, SIZE
bne INCX, TEMP, .L120
srai.d I, N, 3
bge $r0, I, .L997
.align 3

.L110:
vld VX0, XX, 0 * SIZE
vld VX1, XX, 2 * SIZE
vfmul.d VM2, VX0, VALPHA
vfmul.d VM3, VX1, VALPHA
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
vld VX0, XX, 4 * SIZE
vld VX1, XX, 6 * SIZE
vfmul.d VM2, VX0, VALPHA
vfmul.d VM3, VX1, VALPHA
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
addi.d XX, XX, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L110
b .L996
.align 3

.L120:
srai.d I, N, 3
bge $r0, I, .L997
.align 3

.L121:
ld.d t1, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t2, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t3, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
add.d XX, XX, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmul.d VM2, VX0, VALPHA
ld.d t1, XX, 0 * SIZE
add.d XX, XX, INCX
vfmul.d VM3, VX1, VALPHA
ld.d t2, XX, 0 * SIZE
add.d XX, XX, INCX
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
ld.d t3, XX, 0 * SIZE
add.d XX, XX, INCX
ld.d t4, XX, 0 * SIZE
add.d XX, XX, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmul.d VM2, VX0, VALPHA
vfmul.d VM3, VX1, VALPHA
vfmadd.d res1, VM2, VM2, res1
vfmadd.d res2, VM3, VM3, res2
addi.d I, I, -1
blt $r0, I, .L121
b .L996
.align 3

.L996:
vfadd.d res1, res1, res2
vreplvei.d VX1, res1, 1
vfadd.d res1, VX1, res1
.align 3

.L997:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L998:
fld.d $f15, XX, 0 * SIZE
addi.d I, I, -1
fmul.d $f15, $f15, ALPHA
fmadd.d $f19, $f15, $f15, $f19
add.d XX, XX, INCX
blt $r0, I, .L998
.align 3

.L999:
fsqrt.d $f19, $f19
fmul.d $f0, max, $f19
jirl $r0, $r1, 0x0

EPILOGUE
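This LSX variant is the 128-bit sibling of the LASX file above it: two doubles per vector instead of four, so the final reductions at .L96 and .L996 only have two lanes to fold. For readers unfamiliar with the vreplvei/vfadd and vfmaxa idioms, here is a small C model of those two lane reductions; the function names are invented for illustration.

#include <math.h>

/* Lane reductions used at .L96 and .L996 above, modeled in C. */
double reduce_fmaxa2(const double v[2])  /* vreplvei.d + vfmaxa.d: value of larger magnitude */
{
    return fabs(v[0]) >= fabs(v[1]) ? v[0] : v[1];
}

double reduce_add2(const double v[2])    /* vreplvei.d VX1, res1, 1 ; vfadd.d */
{
    return v[0] + v[1];
}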
@@ -0,0 +1,368 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8

#define I $r17
#define TEMP $r18

/* Don't change following FR unless you know the effects. */
#define s1 $f8
#define s2 $f9
#define a1 $f10
#define b1 $f11

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif

/* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
bne INCX, TEMP, .L20 /* inc_x != 1 -> .L20 */
bne INCY, TEMP, .L20 /* inc_y != 1 -> .L20 */

/* (inc_x == 1) && (inc_y == 1) */

/* init $xr8 and $xr9 to zero */
#ifdef DOUBLE
xvldrepl.d $xr0, X, 0
#else
xvldrepl.w $xr0, X, 0
#endif
#ifdef DSDOT
xvfcvtl.d.s $xr0, $xr0
xvfsub.d $xr8, $xr0, $xr0
xvfsub.d $xr9, $xr0, $xr0
#else
XVFSUB $xr8, $xr0, $xr0
XVFSUB $xr9, $xr0, $xr0
#endif

#ifdef DOUBLE
srai.d I, N, 4
#else
srai.d I, N, 5
#endif
bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */
.align 3
.L11:
/* FLOAT: 32~ ; DOUBLE: 16~ */
xvld $xr0, X, 0
xvld $xr1, X, 32
xvld $xr2, X, 64
xvld $xr3, X, 96
xvld $xr4, Y, 0
xvld $xr5, Y, 32
xvld $xr6, Y, 64
xvld $xr7, Y, 96
addi.w I, I, -1
addi.d X, X, 128
addi.d Y, Y, 128
#ifdef DSDOT
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr1
xvfcvtl.d.s $xr11, $xr5
xvfcvth.d.s $xr12, $xr1
xvfcvth.d.s $xr13, $xr5
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr2
xvfcvtl.d.s $xr11, $xr6
xvfcvth.d.s $xr12, $xr2
xvfcvth.d.s $xr13, $xr6
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
xvfcvtl.d.s $xr10, $xr3
xvfcvtl.d.s $xr11, $xr7
xvfcvth.d.s $xr12, $xr3
xvfcvth.d.s $xr13, $xr7
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
XVFMADD $xr9, $xr1, $xr5, $xr9
XVFMADD $xr8, $xr2, $xr6, $xr8
XVFMADD $xr9, $xr3, $xr7, $xr9
#endif
bnez I, .L11
.align 3
.L12:
#ifdef DOUBLE
andi I, N, 0xf
srai.d I, I, 2
#else
andi I, N, 0x1f
srai.d I, I, 3
#endif
bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */
.align 3
.L13:
/* FLOAT: 8~31 ; DOUBLE: 4~15 */
xvld $xr0, X, 0
xvld $xr4, Y, 0
addi.w I, I, -1
addi.d X, X, 32
addi.d Y, Y, 32
#ifdef DSDOT
xvfcvtl.d.s $xr10, $xr0
xvfcvtl.d.s $xr11, $xr4
xvfcvth.d.s $xr12, $xr0
xvfcvth.d.s $xr13, $xr4
xvfmadd.d $xr8, $xr10, $xr12, $xr8
xvfmadd.d $xr9, $xr11, $xr13, $xr9
#else
XVFMADD $xr8, $xr0, $xr4, $xr8
#endif
bnez I, .L13
.align 3
.L14:
/* store dot in s1 $f8 */
#ifdef DSDOT
xvfadd.d $xr8, $xr8, $xr9
fsub.s s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
vfadd.d $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
XVFADD $xr8, $xr8, $xr9
SUB s2, s2, s2 /* set s2 to 0.0 */
xvpermi.q $xr0, $xr8, 0x1
VFADD $vr8, $vr8, $vr0
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
#ifdef DOUBLE
andi I, N, 0x3
#else
andi I, N, 0x7
#endif
bge $r0, I, .L999 /* =0 */
.align 3
.L16:
/* FLOAT: 1~7 ; DOUBLE: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
bnez I, .L16
b .L999
.align 3

.L20:
/* !((inc_x == 1) && (inc_y == 1)) */
srai.d I, N, 3
#ifdef F_INTERFACE
bgez INCX, .L21
addi.d TEMP, N, -1
mult TEMP, INCX
mflo TEMP
dsub X, X, TEMP
.align 3

.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mult TEMP, INCY
mflo TEMP
dsub Y, Y, TEMP
.align 3

.L22:
#endif
bge $r0, I, .L25 /* <8 */
.align 3

.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3

.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3

.L999:
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
move $r4, $r17
jirl $r0, $r1, 0x0

EPILOGUE
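When built with DSDOT, the dot kernel above widens each single-precision pair to double before the fused multiply-add (the xvfcvtl.d.s/xvfcvth.d.s pairs convert the low and high lanes), so accumulation happens entirely in double precision, and two accumulators are used to break the dependency chain. A hedged scalar equivalent, with an invented name and element-stride arguments rather than the kernel's real calling convention:

/* Reference model of the DSDOT path: float inputs, double accumulation. */
double dsdot_ref(long n, const float *x, long incx, const float *y, long incy)
{
    double s1 = 0.0, s2 = 0.0;                 /* two accumulators, like $f8/$f9 */
    long i = 0;
    for (; i + 1 < n; i += 2) {                /* unrolled by two */
        s1 += (double)x[i * incx]       * (double)y[i * incy];
        s2 += (double)x[(i + 1) * incx] * (double)y[(i + 1) * incy];
    }
    for (; i < n; i++)                         /* scalar tail */
        s1 += (double)x[i * incx] * (double)y[i * incy];
    return s1 + s2;                            /* .L999: fadd.d $f0, s1, s2 */
}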
@@ -0,0 +1,364 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8

#define I $r17
#define TEMP $r18

/* Don't change following FR unless you know the effects. */
#define s1 $f8
#define s2 $f9
#define a1 $f10
#define b1 $f11

PROLOGUE

#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif

/* init $f8 and $f9 to zero */
SUB s1, s1, s1
SUB s2, s2, s2
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
bne INCX, TEMP, .L20 /* inc_x != 1 -> .L20 */
bne INCY, TEMP, .L20 /* inc_y != 1 -> .L20 */

/* (inc_x == 1) && (inc_y == 1) */

/* init $vr8 and $vr9 to zero */
#ifdef DOUBLE
vldrepl.d $vr0, X, 0
#else
vldrepl.w $vr0, X, 0
#endif
#ifdef DSDOT
vfcvtl.d.s $vr0, $vr0
vfsub.d $vr8, $vr0, $vr0
vfsub.d $vr9, $vr0, $vr0
#else
VFSUB $vr8, $vr0, $vr0
VFSUB $vr9, $vr0, $vr0
#endif

#ifdef DOUBLE
srai.d I, N, 3
#else
srai.d I, N, 4
#endif
bge $r0, I, .L12 /* FLOAT: <16 ; DOUBLE: <8 */
.align 3
.L11:
/* FLOAT: 16~ ; DOUBLE: 8~ */
vld $vr0, X, 0
vld $vr1, X, 16
vld $vr2, X, 32
vld $vr3, X, 48
vld $vr4, Y, 0
vld $vr5, Y, 16
vld $vr6, Y, 32
vld $vr7, Y, 48
addi.w I, I, -1
addi.d X, X, 64
addi.d Y, Y, 64
#ifdef DSDOT
vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfcvtl.d.s $vr10, $vr1
vfcvtl.d.s $vr11, $vr5
vfcvth.d.s $vr12, $vr1
vfcvth.d.s $vr13, $vr5
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfcvtl.d.s $vr10, $vr2
vfcvtl.d.s $vr11, $vr6
vfcvth.d.s $vr12, $vr2
vfcvth.d.s $vr13, $vr6
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
vfcvtl.d.s $vr10, $vr3
vfcvtl.d.s $vr11, $vr7
vfcvth.d.s $vr12, $vr3
vfcvth.d.s $vr13, $vr7
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
#else
VFMADD $vr8, $vr0, $vr4, $vr8
VFMADD $vr9, $vr1, $vr5, $vr9
VFMADD $vr8, $vr2, $vr6, $vr8
VFMADD $vr9, $vr3, $vr7, $vr9
#endif
bnez I, .L11
.align 3
.L12:
#ifdef DOUBLE
andi I, N, 0x7
srai.d I, I, 1
#else
andi I, N, 0xf
srai.d I, I, 2
#endif
bge $r0, I, .L14 /* DOUBLE: <2 ; FLOAT: <4 */
.align 3
.L13:
/* FLOAT: 4~15 ; DOUBLE: 2~7 */
vld $vr0, X, 0
vld $vr4, Y, 0
addi.w I, I, -1
addi.d X, X, 16
addi.d Y, Y, 16
#ifdef DSDOT
vfcvtl.d.s $vr10, $vr0
vfcvtl.d.s $vr11, $vr4
vfcvth.d.s $vr12, $vr0
vfcvth.d.s $vr13, $vr4
vfmadd.d $vr8, $vr10, $vr12, $vr8
vfmadd.d $vr9, $vr11, $vr13, $vr9
#else
VFMADD $vr8, $vr0, $vr4, $vr8
#endif
bnez I, .L13
.align 3
.L14:
/* store dot in s1 $f8 */
#ifdef DSDOT
vfadd.d $vr8, $vr8, $vr9
fsub.s s2, s2, s2 /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8
vfadd.d $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr9
SUB s2, s2, s2 /* set s2 to 0.0 */
vpackod.d $vr0, $vr8, $vr8
#ifdef DOUBLE
VFADD $vr8, $vr8, $vr0
#else
VFADD $vr8, $vr8, $vr0
vpackod.w $vr0, $vr8, $vr8
VFADD $vr8, $vr8, $vr0
#endif /* defined DOUBLE */
#endif /* defined DSDOT */
.align 3
.L15:
#ifdef DOUBLE
andi I, N, 0x1
#else
andi I, N, 0x3
#endif
bge $r0, I, .L999 /* =0 */
.align 3
.L16:
/* DOUBLE: 1 ; FLOAT: 1~3 */
LD a1, X, 0
LD b1, Y, 0
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
bnez I, .L16
b .L999
.align 3

.L20:
/* !((inc_x == 1) && (inc_y == 1)) */
srai.d I, N, 3
#ifdef F_INTERFACE
bgez INCX, .L21
addi.d TEMP, N, -1
mult TEMP, INCX
mflo TEMP
dsub X, X, TEMP
.align 3

.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mult TEMP, INCY
mflo TEMP
dsub Y, Y, TEMP
.align 3

.L22:
#endif
bge $r0, I, .L25 /* <8 */
.align 3

.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif

LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3

.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3

.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3

.L999:
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
move $r4, $r17
jirl $r0, $r1, 0x0

EPILOGUE
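A pattern shared by all of the strided (INCX != 1 or INCY != 1) loops in these kernels, for example .L21 and .L121 in the nrm2 files, is a manual gather: scalar ld.d loads walk the stride and vinsgr2vr.d packs each element into a vector lane before the SIMD arithmetic runs. The tiny C model below shows the idea for a 2-lane LSX vector; the function name is invented and the stride is in elements, not bytes.

/* The gather idiom used by the strided loops above, modeled in C:
 * scalar loads packed into a 2-lane "vector" before SIMD math. */
static void gather2(const double *x, long incx_elems, double lane[2])
{
    lane[0] = x[0];               /* ld.d t1 ; vinsgr2vr.d VX0, t1, 0 */
    lane[1] = x[incx_elems];      /* ld.d t2 ; vinsgr2vr.d VX0, t2, 1 */
}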
@@ -0,0 +1,194 @@
#define ASSEMBLER
#include "common.h"

#define N $r4
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define XX $r16
#define VX0 $xr12
#define VX1 $xr13
#define VT0 $xr14
#define VT1 $xr15
#define VALPHA $xr19
#define a1 $f8
#define a2 $f23

PROLOGUE

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.d.l a1, a1
movgr2fr.d a2, TEMP
ffint.d.l a2, a2
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L20 //ALPHA==0
fcmp.ceq.d $fcc0, ALPHA, a2
bcnez $fcc0, .L999 //ALPHA==1 return
srai.d I, N, 3
beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1
movfr2gr.d TEMP, ALPHA
xvreplgr2vr.d VALPHA, TEMP
move XX, X
.align 3

.L10:
bge $r0, I, .L32
.align 3
.L11:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
xvfmul.d VT0, VX0, VALPHA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT0, XX, 0, 3
add.d XX, XX, INCX
xvfmul.d VT1, VX1, VALPHA
xvstelm.d VT1, XX, 0, 0
add.d XX, XX, INCX
xvstelm.d VT1, XX, 0, 1
add.d XX, XX, INCX
xvstelm.d VT1, XX, 0, 2
add.d XX, XX, INCX
xvstelm.d VT1, XX, 0, 3
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L11
b .L32
.align 3

.L20:
srai.d I, N, 3
beq INCX, TEMP, .L24
bge $r0, I, .L22
.align 3

.L21:
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L23:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
jirl $r0, $r1, 0
.align 3

.L24:
bge $r0, I, .L26 /*N<8 INCX==1*/
.align 3
.L25:
xvxor.v VX0, VX0, VX0
xvst VX0, X, 0 * SIZE
xvst VX0, X, 4 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L25
.align 3

.L26:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L27:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L27
jirl $r0, $r1, 0
.align 3

.L30:
bge $r0, I, .L32 /*N<8 INCX==1*/
movfr2gr.d TEMP, ALPHA
xvreplgr2vr.d VALPHA, TEMP
.align 3

.L31:
xvld VX0, X, 0 * SIZE
xvld VX1, X, 4 * SIZE
xvfmul.d VT0, VX0, VALPHA
xvfmul.d VT1, VX1, VALPHA
addi.d I, I, -1
xvst VT0, X, 0 * SIZE
xvst VT1, X, 4 * SIZE
addi.d X, X, 8 * SIZE
blt $r0, I, .L31
.align 3

.L32:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L33:
fld.d a1, X, 0 * SIZE
addi.d I, I, -1
fmul.d a1, ALPHA, a1
fst.d a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L33
jirl $r0, $r1, 0
.align 3

.L999:
jirl $r0, $r1, 0x0

EPILOGUE
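The scal kernel above dispatches on alpha before touching the vector units: alpha == 0 branches to .L20 and simply stores zeros, alpha == 1 returns immediately via .L999, and only the general case multiplies in place (contiguous at .L30/.L31, strided at .L10/.L11). A scalar C model of that control flow, with an invented name and element strides, purely for orientation:

/* Reference model of the dscal control flow above (illustrative). */
void dscal_ref(long n, double alpha, double *x, long incx)
{
    if (n <= 0 || incx <= 0) return;     /* kernel bails straight to .L999 */
    if (alpha == 1.0) return;            /* ALPHA==1: nothing to do        */
    if (alpha == 0.0) {                  /* ALPHA==0: store zeros (.L20)   */
        for (long i = 0; i < n; i++) x[i * incx] = 0.0;
        return;
    }
    for (long i = 0; i < n; i++)         /* general case (.L10/.L30)       */
        x[i * incx] *= alpha;
}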
@@ -0,0 +1,205 @@
#define ASSEMBLER
#include "common.h"

#define N $r4
#define ALPHA $f0
#define X $r7
#define INCX $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r18
#define t3 $r15
#define t4 $r17
#define XX $r16
#define VX0 $vr12
#define VX1 $vr13
#define VT0 $vr14
#define VT1 $vr15
#define VALPHA $vr19
#define a1 $f8
#define a2 $f23

PROLOGUE

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
movgr2fr.d a1, $r0
ffint.d.l a1, a1
movgr2fr.d a2, TEMP
ffint.d.l a2, a2
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
fcmp.ceq.d $fcc0, ALPHA, a1
bcnez $fcc0, .L20 //ALPHA==0
fcmp.ceq.d $fcc0, ALPHA, a2
bcnez $fcc0, .L999 //ALPHA==1 return
srai.d I, N, 3
beq INCX, TEMP, .L30 //ALPHA!=0|1 and INCX==1
movfr2gr.d TEMP, ALPHA
vreplgr2vr.d VALPHA, TEMP
move XX, X
.align 3

.L10: //ALPHA!=0|1 and INCX!=1
bge $r0, I, .L32
.align 3

.L11:
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vfmul.d VT0, VX0, VALPHA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
vfmul.d VT1, VX1, VALPHA
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vstelm.d VT1, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT1, XX, 0, 1
add.d XX, XX, INCX
vfmul.d VT0, VX0, VALPHA
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vstelm.d VT0, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT0, XX, 0, 1
add.d XX, XX, INCX
vfmul.d VT1, VX1, VALPHA
vstelm.d VT1, XX, 0, 0
add.d XX, XX, INCX
vstelm.d VT1, XX, 0, 1
add.d XX, XX, INCX
addi.d I, I, -1
blt $r0, I, .L11
b .L32
.align 3

.L20:
srai.d I, N, 3
beq INCX, TEMP, .L24
bge $r0, I, .L22
.align 3

.L21:
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
fst.d a1, X, 0
add.d X, X, INCX
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L23:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
jirl $r0, $r1, 0
.align 3

.L24:
bge $r0, I, .L26 /*N<8 INCX==1*/
.align 3
.L25:
vxor.v VX0, VX0, VX0
vst VX0, X, 0 * SIZE
vst VX0, X, 2 * SIZE
vst VX0, X, 4 * SIZE
vst VX0, X, 6 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L25
.align 3

.L26:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L27:
fst.d a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L27
jirl $r0, $r1, 0
.align 3

.L30:
bge $r0, I, .L32 /*N<8 INCX==1*/
movfr2gr.d TEMP, ALPHA
vreplgr2vr.d VALPHA, TEMP
.align 3

.L31:
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vfmul.d VT0, VX0, VALPHA
vfmul.d VT1, VX1, VALPHA
vld VX0, X, 4 * SIZE
vst VT0, X, 0 * SIZE
vst VT1, X, 2 * SIZE
vfmul.d VT0, VX0, VALPHA
vld VX1, X, 6 * SIZE
vst VT0, X, 4 * SIZE
vfmul.d VT1, VX1, VALPHA
vst VT1, X, 6 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L31
.align 3

.L32:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L33:
fld.d a1, X, 0 * SIZE
addi.d I, I, -1
fmul.d a1, ALPHA, a1
fst.d a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L33
jirl $r0, $r1, 0
.align 3

.L999:
jirl $r0, $r1, 0x0

EPILOGUE
@@ -0,0 +1,542 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define t1 $r13
|
||||
#define t2 $r15
|
||||
#define t3 $r18
|
||||
#define t4 $r16
|
||||
#define i0 $r17
|
||||
#define i1 $r14
|
||||
#define TEMP $r19
|
||||
#define x1 $xr9
|
||||
#define x2 $xr10
|
||||
#define x3 $xr11
|
||||
#define x4 $xr12
|
||||
#define VX0 $xr13
|
||||
#define VX1 $xr14
|
||||
#define VM0 $xr15
|
||||
#define VM1 $xr16
|
||||
#define VINC4 $xr17
|
||||
#define VINC8 $xr18
|
||||
#define VI0 $xr20
|
||||
#define VI1 $xr21
|
||||
#define VI2 $xr22
|
||||
#define VI3 $xr8
|
||||
#define VI4 $xr19
|
||||
#define VT0 $xr23
|
||||
|
||||
PROLOGUE
|
||||
li.d i0, 0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20
|
||||
xvld VM0, X, 0
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
slli.d i0, i0, 1 //8
|
||||
xvreplgr2vr.d VINC8, i0
|
||||
addi.d i0, i0, -15
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 5
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 1 //2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 2 //3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 4 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
#ifdef DOUBLE
|
||||
xvld VX0, X, 0 * SIZE
|
||||
xvadd.d VI1, VI1, VINC8
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvadd.d VI2, VI1, VINC4
|
||||
xvfmaxa.d VM1, VX0, VX1
|
||||
xvfcmp.ceq.d VT0, VX0, VM1
|
||||
addi.d I, I, -1
|
||||
xvbitsel.v VI2, VI2, VI1, VT0
|
||||
xvfmaxa.d VM1, VM0, VM1
|
||||
xvfcmp.ceq.d VT0, VM0, VM1
|
||||
addi.d X, X, 8 * SIZE
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI2, VI0, VT0
|
||||
#else
|
||||
xvld VX0, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
xvadd.w VI1, VI1, VINC8
|
||||
xvfmaxa.s VM1, VX0, VM0
|
||||
xvfcmp.ceq.s VT0, VM0, VM1
|
||||
addi.d X, X, 8 * SIZE
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI1, VI0, VT0
|
||||
#endif
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VI1, VI0, 0
|
||||
xvpickve.d VI2, VI0, 1
|
||||
xvpickve.d VI3, VI0, 2
|
||||
xvpickve.d VI4, VI0, 3
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
xvpickve.d x3, VM0, 2
|
||||
xvpickve.d x4, VM0, 3
|
||||
#else
|
||||
xvxor.v VX0, VX0, VX0
|
||||
xvor.v VX0, VI0, VX0
|
||||
xvxor.v VX1, VX1, VX1
|
||||
xvor.v VX1, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 0
|
||||
xvpickve.w VI2, VI0, 1
|
||||
xvpickve.w VI3, VI0, 2
|
||||
xvpickve.w VI4, VI0, 3
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
#endif
|
||||
XVFMAXA VM1, x1, x2
|
||||
XVCMPEQ VT0, x1, VM1
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
XVFMAXA VM0, x3, x4
|
||||
XVCMPEQ VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI4, VI3, VT0
|
||||
XVFMAXA VM0, VM0, VM1
|
||||
XVCMPEQ VT0, VM0, VM1
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
CMPEQ $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
XVCMPLT VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
b .L26
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1
|
||||
move TEMP, X
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
ld.d t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.d VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.d t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.d t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.d t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.d VM0, t2, 1
|
||||
xvinsgr2vr.d VM0, t3, 2
|
||||
xvinsgr2vr.d VM0, t4, 3
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
slli.d i0, i0, 1 //8
|
||||
xvreplgr2vr.d VINC8, i0
|
||||
addi.d i0, i0, -15
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 5
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 1 //2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 2 //3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.w VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.w VM0, t2, 1
|
||||
xvinsgr2vr.w VM0, t3, 2
|
||||
xvinsgr2vr.w VM0, t4, 3
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.w VM0, t1, 4
|
||||
xvinsgr2vr.w VM0, t2, 5
|
||||
xvinsgr2vr.w VM0, t3, 6
|
||||
xvinsgr2vr.w VM0, t4, 7
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 4 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L24:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX0, t1, 0
|
||||
xvinsgr2vr.d VX0, t2, 1
|
||||
xvinsgr2vr.d VX0, t3, 2
|
||||
xvinsgr2vr.d VX0, t4, 3
|
||||
xvadd.d VI1, VI1, VINC8
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX1, t1, 0
|
||||
xvinsgr2vr.d VX1, t2, 1
|
||||
xvinsgr2vr.d VX1, t3, 2
|
||||
xvinsgr2vr.d VX1, t4, 3
|
||||
xvadd.d VI2, VI1, VINC4
|
||||
xvfmaxa.d VM1, VX0, VX1
|
||||
xvfcmp.ceq.d VT0, VX0, VM1
|
||||
addi.d I, I, -1
|
||||
xvbitsel.v VI2, VI2, VI1, VT0
|
||||
xvfmaxa.d VM1, VM0, VM1
|
||||
xvfcmp.ceq.d VT0, VM0, VM1
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI2, VI0, VT0
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
xvadd.w VI1, VI1, VINC8
|
||||
xvfmaxa.s VM1, VX0, VM0
|
||||
xvfcmp.ceq.s VT0, VM1, VM0
|
||||
addi.d I, I, -1
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI1, VI0, VT0
|
||||
#endif
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VI1, VI0, 0
|
||||
xvpickve.d VI2, VI0, 1
|
||||
xvpickve.d VI3, VI0, 2
|
||||
xvpickve.d VI4, VI0, 3
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
xvpickve.d x3, VM0, 2
|
||||
xvpickve.d x4, VM0, 3
|
||||
xvfmaxa.d VM1, x1, x2
|
||||
xvfcmp.ceq.d VT0, x1, VM1
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
xvfmaxa.d VM0, x4, x3
|
||||
xvfcmp.ceq.d VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI4, VI3, VT0
|
||||
xvfmaxa.d VM0, VM0, VM1
|
||||
xvfcmp.ceq.d VT0, VM0, VM1
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
#else
|
||||
xvxor.v VX0, VX0, VX0
|
||||
xvor.v VX0, VI0, VX0
|
||||
xvxor.v VX1, VX1, VX1
|
||||
xvor.v VX1, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 0
|
||||
xvpickve.w VI2, VI0, 1
|
||||
xvpickve.w VI3, VI0, 2
|
||||
xvpickve.w VI4, VI0, 3
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
xvfmaxa.s VM1, x1, x2
|
||||
xvfcmp.ceq.s VT0, x1, VM1
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
xvfmaxa.s VM0, x3, x4
|
||||
xvfcmp.ceq.s VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI3, VI4, VT0
|
||||
xvfmaxa.s VM0, VM0, VM1
|
||||
xvfcmp.ceq.s VT0, VM0, VM1
|
||||
xvbitsel.v VM0, VM0, VM1, VT0
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
#endif
|
||||
CMPEQ $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
XVCMPLT VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L27
|
||||
XVCMPLT VT0, VI2, VI0
|
||||
xvbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L28
|
||||
XVCMPLT VT0, VI3, VI0
|
||||
xvbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L28:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L29
|
||||
XVCMPLT VT0, VI4, VI0
|
||||
xvbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L29:
|
||||
#ifdef DOUBLE
|
||||
movfr2gr.d i0, $f20
|
||||
#else
|
||||
fmov.s $f16, $f20
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
#ifdef DOUBLE
|
||||
|
||||
#else
|
||||
.L252:
|
||||
xvxor.v VI0, VI0, VI0
|
||||
xvor.v VI0, VI0, VX0
|
||||
fmov.s $f13, $f15
|
||||
xvxor.v VM0, VM0, VM0
|
||||
xvor.v VM0, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 4
|
||||
xvpickve.w VI2, VI0, 5
|
||||
xvpickve.w VI3, VI0, 6
|
||||
xvpickve.w VI4, VI0, 7
|
||||
xvpickve.w x1, VM0, 4
|
||||
xvpickve.w x2, VM0, 5
|
||||
xvpickve.w x3, VM0, 6
|
||||
xvpickve.w x4, VM0, 7
|
||||
xvfmaxa.s VM1, x1, x2
|
||||
xvfcmp.ceq.s VT0, x1, VM1
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
xvfmaxa.s VM0, x3, x4
|
||||
xvfcmp.ceq.s VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI4, VI3, VT0
|
||||
xvfmaxa.s VM0, VM0, VM1
|
||||
xvfcmp.ceq.s VT0, VM0, VM1
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L262
|
||||
xvfcmp.clt.s VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L262:
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L272
|
||||
xvfcmp.clt.s VT0, VI2, VI0
|
||||
xvbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
.L272:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L282
|
||||
xvfcmp.clt.s VT0, VI3, VI0
|
||||
xvbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L282:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L292
|
||||
xvfcmp.clt.s VT0, VI4, VI0
|
||||
xvbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L292:
|
||||
xvfmaxa.s VM0, VX0, VM0
|
||||
xvfcmp.ceq.s VT0, VM0, VX0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
movfr2gr.s i0, $f20
|
||||
#endif
|
||||
|
||||
.L21: // N<8
|
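// scalar tail: compare the remaining N % 8 elements one at a time in lane 0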
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
srai.d i1, N, 3
|
||||
slli.d i1, i1, 3
|
||||
addi.d i1, i1, 1 //current index
|
||||
movgr2fr.d $f21, i1
|
||||
movgr2fr.d $f20, i0
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
LD $f9, X, 0
|
||||
addi.d I, I, -1
|
||||
XVFMAXA VM1, x1, VM0
|
||||
XVCMPEQ VT0, VM0, VM1
|
||||
add.d X, X, INCX
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI1, VI0, VT0
|
||||
addi.d i1, i1, 1
|
||||
movgr2fr.d $f21, i1
|
||||
blt $r0, I, .L22
|
||||
MTG i0, $f20
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r17
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
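
The kernel above and the hunks that follow are LoongArch implementations of the BLAS i?amax/i?amin primitives: return the 1-based index of the first element whose absolute value is largest (or smallest). As a reading aid, here is a minimal scalar C sketch of the semantics the vectorized code reproduces; the function name and signature are illustrative, not the kernel's actual interface, and the assembly additionally unrolls by 8 and keeps candidate indices in vector registers.

#include <math.h>
#include <stddef.h>

/* 1-based index of the first element of x with maximal |x[i*incx]|;
 * 0 when n or incx is not positive, matching the PROLOGUE early exits. */
static size_t iamax_ref(size_t n, const double *x, size_t incx)
{
    if (n == 0 || incx == 0) return 0;
    size_t best = 1;
    double m = fabs(x[0]);
    for (size_t i = 1; i < n; i++) {
        double v = fabs(x[i * incx]);
        if (v > m) {            /* strict '>' keeps the first occurrence */
            m = v;
            best = i + 1;       /* BLAS indices are 1-based */
        }
    }
    return best;
}
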
@ -0,0 +1,482 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define t1 $r13
|
||||
#define t2 $r15
|
||||
#define t3 $r18
|
||||
#define t4 $r16
|
||||
#define i0 $r17
|
||||
#define i1 $r14
|
||||
#define TEMP $r19
|
||||
#define x1 $vr9
|
||||
#define x2 $vr10
|
||||
#define x3 $vr11
|
||||
#define x4 $vr12
|
||||
#define VX0 $vr13
|
||||
#define VX1 $vr14
|
||||
#define VM0 $vr15
|
||||
#define VM1 $vr16
|
||||
#define VINC2 $vr17
|
||||
#define VINC4 $vr18
|
||||
#define VI0 $vr20
|
||||
#define VI1 $vr21
|
||||
#define VI2 $vr22
|
||||
#define VI3 $vr8
|
||||
#define VI4 $vr19
|
||||
#define VT0 $vr23
|
||||
|
||||
PROLOGUE
|
||||
li.d i0, 0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20
|
||||
vld VM0, X, 0
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L11
|
||||
slli.d i0, i0, 1 //2
|
||||
vreplgr2vr.d VINC2, i0
|
||||
slli.d i0, i0, 1 //4
|
||||
vreplgr2vr.d VINC4, i0
|
||||
addi.d i0, i0, -7
|
||||
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 3
|
||||
vinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI0, i0, 1 //2
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.w i0, i0, 2 //4
|
||||
vreplgr2vr.w VINC2, i0
|
||||
slli.w i0, i0, 1 //8
|
||||
vreplgr2vr.w VINC4, i0
|
||||
addi.w i0, i0, -15
|
||||
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 5
|
||||
vinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 3 //4
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
#ifdef DOUBLE
|
||||
vld VX0, X, 0 * SIZE
|
||||
vadd.d VI1, VI1, VINC4
|
||||
vld VX1, X, 2 * SIZE
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmaxa.d x1, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x1
|
||||
vbitsel.v x2, VI2, VI1, VT0
|
||||
vld VX0, X, 4 * SIZE
|
||||
vadd.d VI1, VI2, VINC2
|
||||
vld VX1, X, 6 * SIZE
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmaxa.d x3, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x3
|
||||
vbitsel.v x4, VI2, VI1, VT0
|
||||
vfmaxa.d x3, x1, x3
|
||||
vfcmp.ceq.d VT0, x1, x3
|
||||
vbitsel.v x2, x4, x2, VT0
|
||||
vfmaxa.d VM1, VM0, x3
|
||||
vfcmp.ceq.d VT0, VM0, VM1
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, x2, VI0, VT0
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 8 * SIZE
|
||||
#else
|
||||
vld VX0, X, 0 * SIZE
|
||||
vadd.w VI1, VI1, VINC4
|
||||
vld VX1, X, 4 * SIZE
|
||||
vadd.w VI2, VI1, VINC2
|
||||
vfmaxa.s VM1, VX0, VX1
|
||||
vfcmp.ceq.s VT0, VX0, VM1
|
||||
addi.d I, I, -1
|
||||
vbitsel.v VI2, VI2, VI1, VT0
|
||||
vfmaxa.s VM1, VM0, VM1
|
||||
vfcmp.ceq.s VT0, VM0, VM1
|
||||
addi.d X, X, 8 * SIZE
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, VI2, VI0, VT0
|
||||
#endif
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d VI1, VI0, 0
|
||||
vreplvei.d VI2, VI0, 1
|
||||
vreplvei.d x1, VM0, 0
|
||||
vreplvei.d x2, VM0, 1
|
||||
fcmp.ceq.d $fcc0, $f10, $f9
|
||||
bceqz $fcc0, .L16
|
||||
vfcmp.clt.d VT0, VI1, VI2
|
||||
vbitsel.v VI0, VI2, VI1, VT0
|
||||
b .L17
|
||||
#else
|
||||
vreplvei.w VI1, VI0, 0
|
||||
vreplvei.w VI2, VI0, 1
|
||||
vreplvei.w VI3, VI0, 2
|
||||
vreplvei.w VI4, VI0, 3
|
||||
vreplvei.w x1, VM0, 0
|
||||
vreplvei.w x2, VM0, 1
|
||||
vreplvei.w x3, VM0, 2
|
||||
vreplvei.w x4, VM0, 3
|
||||
vfmaxa.s VM1, x1, x2
|
||||
vfcmp.ceq.s VT0, VM1, x1
|
||||
vbitsel.v VINC2, VI2, VI1, VT0
|
||||
vfmaxa.s VM0, x3, x4
|
||||
vfcmp.ceq.s VT0, x3, VM0
|
||||
vbitsel.v VINC4, VI4, VI3, VT0
|
||||
vfmaxa.s VM0, VM0, VM1
|
||||
vfcmp.ceq.s VT0, VM0, VM1
|
||||
vbitsel.v VI0, VINC4, VINC2, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
vfcmp.clt.s VT0, VI1, VI0
|
||||
vbitsel.v VI0, VI0, VI1, VT0
|
||||
b .L26
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
#ifdef DOUBLE
|
||||
.L16:
|
||||
vfmaxa.d VM0, x1, x2
|
||||
vfcmp.ceq.d VT0, x1, VM0
|
||||
vbitsel.v VI0, VI2, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L17:
|
||||
movfr2gr.d i0, $f20
|
||||
.align 3
|
||||
|
||||
.L11: // INCX==1 and N<8
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L14
|
||||
srai.d i1, N, 3
|
||||
slli.d i1, i1, 3
|
||||
addi.d i1, i1, 1 //current index
|
||||
movgr2fr.d $f21, i1
|
||||
movgr2fr.d $f20, i0
|
||||
.align 3
|
||||
|
||||
.L13:
|
||||
fld.d $f9, X, 0
|
||||
vfmaxa.d VM1, x1, VM0
|
||||
vfcmp.ceq.d VT0, VM0, VM1
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, VI1, VI0, VT0
|
||||
addi.d I, I, -1
|
||||
addi.d i1, i1, 1
|
||||
addi.d X, X, SIZE
|
||||
movgr2fr.d $f21, i1
|
||||
blt $r0, I, .L13
|
||||
movfr2gr.d i0, $f20
|
||||
.align 3
|
||||
|
||||
.L14:
|
||||
move $r4, $r17
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1
|
||||
move TEMP, X
|
||||
addi.d i0, i0, 1
|
||||
ld.d t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
vinsgr2vr.d VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.d t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
vinsgr2vr.d VM0, t2, 1
|
||||
slli.d i0, i0, 1 //2
|
||||
vreplgr2vr.d VINC2, i0
|
||||
slli.d i0, i0, 1 //4
|
||||
vreplgr2vr.d VINC4, i0
|
||||
addi.d i0, i0, -7
|
||||
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 3
|
||||
vinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI0, i0, 1 //2
|
||||
.align 3
|
||||
|
||||
.L24:
|
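// INCX != 1: scalar ld.d fills two 64-bit lanes per vector; four such pairs cover the 8-element unroll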
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vadd.d VI1, VI1, VINC4
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t1, 0
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t2, 1
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmaxa.d x1, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x1
|
||||
vbitsel.v x2, VI2, VI1, VT0
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vadd.d VI1, VI2, VINC2
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t1, 0
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t2, 1
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmaxa.d x3, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x3
|
||||
vbitsel.v x4, VI2, VI1, VT0
|
||||
vfmaxa.d x3, x1, x3
|
||||
vfcmp.ceq.d VT0, x1, x3
|
||||
vbitsel.v x2, x4, x2, VT0
|
||||
vfmaxa.d VM1, VM0, x3
|
||||
vfcmp.ceq.d VT0, VM0, VM1
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, x2, VI0, VT0
|
||||
addi.d I, I, -1
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
vreplvei.d VI1, VI0, 0
|
||||
vreplvei.d VI2, VI0, 1
|
||||
vreplvei.d x1, VM0, 0
|
||||
vreplvei.d x2, VM0, 1
|
||||
fcmp.ceq.d $fcc0, $f10, $f9
|
||||
bceqz $fcc0, .L26
|
||||
vfcmp.clt.d VT0, VI1, VI2
|
||||
vbitsel.v VI0, VI2, VI1, VT0
|
||||
b .L27
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
vfmaxa.d VM0, x1, x2
|
||||
vfcmp.ceq.d VT0, x1, VM0
|
||||
vbitsel.v VI0, VI2, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
movfr2gr.d i0, $f20
|
||||
.align 3
|
||||
|
||||
#else
|
||||
.L20: // INCX!=1
|
||||
move TEMP, X
|
||||
addi.w i0, i0, 1
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
vinsgr2vr.w VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
vinsgr2vr.w VM0, t2, 1
|
||||
vinsgr2vr.w VM0, t3, 2
|
||||
vinsgr2vr.w VM0, t4, 3
|
||||
slli.w i0, i0, 2 //4
|
||||
vreplgr2vr.w VINC2, i0
|
||||
slli.w i0, i0, 1 //8
|
||||
vreplgr2vr.w VINC4, i0
|
||||
addi.w i0, i0, -15
|
||||
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 5
|
||||
vinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 3 //4
|
||||
.align 3
|
||||
|
||||
.L24:
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
vadd.w VI1, VI1, VINC4
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vadd.w VI2, VI1, VINC2
|
||||
vfmaxa.s VM1, VX0, VX1
|
||||
vfcmp.ceq.s VT0, VX0, VM1
|
||||
vbitsel.v VI2, VI2, VI1, VT0
|
||||
vfmaxa.s VM1, VM0, VM1
|
||||
vfcmp.ceq.s VT0, VM0, VM1
|
||||
addi.d I, I, -1
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, VI2, VI0, VT0
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
vreplvei.w VI1, VI0, 0
|
||||
vreplvei.w VI2, VI0, 1
|
||||
vreplvei.w VI3, VI0, 2
|
||||
vreplvei.w VI4, VI0, 3
|
||||
vreplvei.w x1, VM0, 0
|
||||
vreplvei.w x2, VM0, 1
|
||||
vreplvei.w x3, VM0, 2
|
||||
vreplvei.w x4, VM0, 3
|
||||
vfmaxa.s VM1, x1, x2
|
||||
vfcmp.ceq.s VT0, VM1, x1
|
||||
vbitsel.v VINC2, VI2, VI1, VT0
|
||||
vfmaxa.s VM0, x3, x4
|
||||
vfcmp.ceq.s VT0, x3, VM0
|
||||
vbitsel.v VINC4, VI4, VI3, VT0
|
||||
vfmaxa.s VM0, VM0, VM1
|
||||
vfcmp.ceq.s VT0, VM0, VM1
|
||||
vbitsel.v VI0, VINC4, VINC2, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
vfcmp.clt.s VT0, VI1, VI0
|
||||
vbitsel.v VI0, VI0, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L27
|
||||
vfcmp.clt.s VT0, VI2, VI0
|
||||
vbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L28
|
||||
vfcmp.clt.s VT0, VI3, VI0
|
||||
vbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L28:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L29
|
||||
vfcmp.clt.s VT0, VI4, VI0
|
||||
vbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L29:
|
||||
movfr2gr.s i0, $f20
|
||||
.align 3
|
||||
|
||||
#endif
|
||||
.L21: // N<8
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
srai.d i1, N, 3
|
||||
slli.d i1, i1, 3
|
||||
addi.d i1, i1, 1 //current index
|
||||
movgr2fr.d $f21, i1
|
||||
movgr2fr.d $f20, i0
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
LD $f9, X, 0
|
||||
VFMAXA VM1, x1, VM0
|
||||
VCMPEQ VT0, VM0, VM1
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, VI1, VI0, VT0
|
||||
addi.d I, I, -1
|
||||
addi.d i1, i1, 1
|
||||
add.d X, X, INCX
|
||||
movgr2fr.d $f21, i1
|
||||
blt $r0, I, .L22
|
||||
MTG i0, $f20
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r17
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
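
Every update step in these kernels uses the same branchless idiom: fmaxa/fmina picks the operand with the larger/smaller magnitude, fcmp.ceq against the old accumulator builds a mask, and bitsel applies that mask to both the value vector and the index vector. A scalar C model of one step follows (a sketch only; NaN and tie behaviour of the hardware comparison is not modeled here):

#include <math.h>

/* Models: VM1 = fmaxa(VM0, v); VT0 = (VM0 == VM1);
 *         VM0 = VT0 ? VM0 : VM1; VI0 = VT0 ? VI0 : idx; */
typedef struct { double m; long i; } acc_t;

static acc_t update_max(acc_t a, double v, long idx)
{
    double m1 = fabs(v) > fabs(a.m) ? v : a.m;        /* fmaxa: larger magnitude */
    int keep = (a.m == m1);                           /* ceq mask */
    acc_t r = { keep ? a.m : m1, keep ? a.i : idx };  /* the two bitsels */
    return r;
}
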
@ -0,0 +1,486 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define t1 $r13
|
||||
#define t2 $r15
|
||||
#define t3 $r18
|
||||
#define t4 $r16
|
||||
#define i0 $r17
|
||||
#define i1 $r14
|
||||
#define TEMP $r19
|
||||
#define x1 $xr9
|
||||
#define x2 $xr10
|
||||
#define x3 $xr11
|
||||
#define x4 $xr12
|
||||
#define VX0 $xr13
|
||||
#define VX1 $xr14
|
||||
#define VM0 $xr15
|
||||
#define VM1 $xr16
|
||||
#define VINC4 $xr17
|
||||
#define VINC8 $xr18
|
||||
#define VI0 $xr20
|
||||
#define VI1 $xr21
|
||||
#define VI2 $xr22
|
||||
#define VI3 $xr8
|
||||
#define VI4 $xr19
|
||||
#define VT0 $xr23
|
||||
|
||||
PROLOGUE
|
||||
li.d i0, 0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20
|
||||
xvld VM0, X, 0
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
slli.d i0, i0, 1 //8
|
||||
xvreplgr2vr.d VINC8, i0
|
||||
addi.d i0, i0, -15
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 5
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 1 //2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 2 //3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 4 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
.align 3
|
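// VI1 is seeded with negative offsets so the first VINC add yields the 1-based indices of the first block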
||||
|
||||
.L10:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvadd.d VI1, VI1, VINC8
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvadd.d VI2, VI1, VINC4
|
||||
xvfmina.d VM1, VX0, VX1
|
||||
xvfcmp.ceq.d VT0, VX0, VM1
|
||||
addi.d I, I, -1
|
||||
xvbitsel.v VI2, VI2, VI1, VT0
|
||||
xvfmina.d VM1, VM0, VM1
|
||||
#else
|
||||
addi.d I, I, -1
|
||||
xvadd.w VI2, VI1, VINC8
|
||||
xvfmina.s VM1, VX0, VM0
|
||||
#endif
|
||||
XVCMPEQ VT0, VM0, VM1
|
||||
addi.d X, X, 8 * SIZE
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI2, VI0, VT0
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VI1, VI0, 0
|
||||
xvpickve.d VI2, VI0, 1
|
||||
xvpickve.d VI3, VI0, 2
|
||||
xvpickve.d VI4, VI0, 3
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
xvpickve.d x3, VM0, 2
|
||||
xvpickve.d x4, VM0, 3
|
||||
xvfmina.d VM1, x1, x2
|
||||
xvfcmp.ceq.d VT0, x1, VM1
|
||||
#else
|
||||
xvxor.v VX0, VX0, VX0
|
||||
xvor.v VX0, VI0, VX0
|
||||
xvxor.v VX1, VX1, VX1
|
||||
xvor.v VX1, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 0
|
||||
xvpickve.w VI2, VI0, 1
|
||||
xvpickve.w VI3, VI0, 2
|
||||
xvpickve.w VI4, VI0, 3
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
xvfmina.s VM1, x1, x2
|
||||
xvfcmp.ceq.s VT0, x1, VM1
|
||||
#endif
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
XVFMINA VM0, x4, x3
|
||||
XVCMPEQ VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI4, VI3, VT0
|
||||
XVFMINA VM0, VM0, VM1
|
||||
XVCMPEQ VT0, VM0, VM1
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
XVCMPLT VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
b .L26
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1
|
||||
move TEMP, X
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
ld.d t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.d VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.d t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.d t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.d t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.d VM0, t2, 1
|
||||
xvinsgr2vr.d VM0, t3, 2
|
||||
xvinsgr2vr.d VM0, t4, 3
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
slli.d i0, i0, 1 //8
|
||||
xvreplgr2vr.d VINC8, i0
|
||||
addi.d i0, i0, -15
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 5
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 1 //2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 2 //3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.w VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.w VM0, t2, 1
|
||||
xvinsgr2vr.w VM0, t3, 2
|
||||
xvinsgr2vr.w VM0, t4, 3
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.w VM0, t1, 4
|
||||
xvinsgr2vr.w VM0, t2, 5
|
||||
xvinsgr2vr.w VM0, t3, 6
|
||||
xvinsgr2vr.w VM0, t4, 7
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 4 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L24:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX0, t1, 0
|
||||
xvinsgr2vr.d VX0, t2, 1
|
||||
xvinsgr2vr.d VX0, t3, 2
|
||||
xvinsgr2vr.d VX0, t4, 3
|
||||
xvadd.d VI1, VI1, VINC8
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d VX1, t1, 0
|
||||
xvinsgr2vr.d VX1, t2, 1
|
||||
xvinsgr2vr.d VX1, t3, 2
|
||||
xvinsgr2vr.d VX1, t4, 3
|
||||
xvadd.d VI2, VI1, VINC4
|
||||
xvfmina.d VM1, VX0, VX1
|
||||
xvfcmp.ceq.d VT0, VX0, VM1
|
||||
xvbitsel.v VI2, VI2, VI1, VT0
|
||||
xvfmina.d VM1, VM0, VM1
|
||||
xvfcmp.ceq.d VT0, VM0, VM1
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 0
|
||||
xvinsgr2vr.w VX0, t2, 1
|
||||
xvinsgr2vr.w VX0, t3, 2
|
||||
xvinsgr2vr.w VX0, t4, 3
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w VX0, t1, 4
|
||||
xvinsgr2vr.w VX0, t2, 5
|
||||
xvinsgr2vr.w VX0, t3, 6
|
||||
xvinsgr2vr.w VX0, t4, 7
|
||||
xvadd.w VI2, VI1, VINC8
|
||||
xvfmina.s VM1, VX0, VM0
|
||||
xvfcmp.ceq.s VT0, VM1, VM0
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI2, VI0, VT0
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L27
|
||||
XVCMPLT VT0, VI2, VI0
|
||||
xvbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L28
|
||||
XVCMPLT VT0, VI3, VI0
|
||||
xvbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L28:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L29
|
||||
XVCMPLT VT0, VI4, VI0
|
||||
xvbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L29:
|
||||
#ifdef DOUBLE
|
||||
movfr2gr.d i0, $f20
|
||||
.align 3
|
||||
#else
|
||||
fmov.s $f16, $f20
|
||||
.align 3
|
||||
|
||||
.L252:
|
||||
xvxor.v VI0, VI0, VI0
|
||||
xvor.v VI0, VI0, VX0
|
||||
fmov.s $f13, $f15
|
||||
xvxor.v VM0, VM0, VM0
|
||||
xvor.v VM0, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 4
|
||||
xvpickve.w VI2, VI0, 5
|
||||
xvpickve.w VI3, VI0, 6
|
||||
xvpickve.w VI4, VI0, 7
|
||||
xvpickve.w x1, VM0, 4
|
||||
xvpickve.w x2, VM0, 5
|
||||
xvpickve.w x3, VM0, 6
|
||||
xvpickve.w x4, VM0, 7
|
||||
xvfmina.s VM1, x1, x2
|
||||
xvfcmp.ceq.s VT0, x1, VM1
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
xvfmina.s VM0, x3, x4
|
||||
xvfcmp.ceq.s VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI4, VI3, VT0
|
||||
xvfmina.s VM0, VM0, VM1
|
||||
xvfcmp.ceq.s VT0, VM0, VM1
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L262
|
||||
xvfcmp.clt.s VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L262:
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L272
|
||||
xvfcmp.clt.s VT0, VI2, VI0
|
||||
xvbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
|
||||
.L272:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L282
|
||||
xvfcmp.clt.s VT0, VI3, VI0
|
||||
xvbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L282:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L292
|
||||
xvfcmp.clt.s VT0, VI4, VI0
|
||||
xvbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L292:
|
||||
xvfmina.s VM0, VX0, VM0
|
||||
xvfcmp.ceq.s VT0, VM0, VX0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
movfr2gr.s i0, $f20
|
||||
|
||||
#endif
|
||||
|
||||
.L21: // N<8
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
srai.d i1, N, 3
|
||||
slli.d i1, i1, 3
|
||||
addi.d i1, i1, 1 //current index
|
||||
movgr2fr.d $f21, i1
|
||||
movgr2fr.d $f20, i0
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
LD $f9, X, 0
|
||||
addi.d I, I, -1
|
||||
XVFMINA VM1, x1, VM0
|
||||
XVCMPEQ VT0, VM0, VM1
|
||||
add.d X, X, INCX
|
||||
xvbitsel.v VM0, VM1, VM0, VT0
|
||||
xvbitsel.v VI0, VI1, VI0, VT0
|
||||
addi.d i1, i1, 1
|
||||
movgr2fr.d $f21, i1
|
||||
blt $r0, I, .L22
|
||||
MTG i0, $f20
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r17
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
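
The 128-bit LSX variant that follows performs the same reduction with half the lane count; for INCX != 1 it gathers elements one scalar load at a time into vector lanes before running the compare/select. A rough C picture of one single-precision gather-and-fold step (4 lanes shown; the kernel covers 8 elements per trip in two such halves, and all names here are illustrative):

#include <math.h>
#include <stddef.h>

static void gather_step(const float *x, size_t incx,
                        float vm[4], int vi[4], int idx[4])
{
    for (int k = 0; k < 4; k++) {
        float v = x[(size_t)k * incx];                   /* ld.w + vinsgr2vr.w */
        idx[k] += 8;                                     /* vadd.w VI1, VI1, VINC8 */
        float m1 = fabsf(v) < fabsf(vm[k]) ? v : vm[k];  /* vfmina.s */
        if (vm[k] != m1) {                               /* vfcmp.ceq + vbitsel */
            vm[k] = m1;
            vi[k] = idx[k];
        }
    }
}
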
@ -0,0 +1,446 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define t1 $r13
|
||||
#define t2 $r15
|
||||
#define t3 $r18
|
||||
#define t4 $r16
|
||||
#define i0 $r17
|
||||
#define i1 $r14
|
||||
#define TEMP $r19
|
||||
#define x1 $vr9
|
||||
#define x2 $vr10
|
||||
#define x3 $vr11
|
||||
#define x4 $vr12
|
||||
#define VX0 $vr13
|
||||
#define VX1 $vr14
|
||||
#define VM0 $vr15
|
||||
#define VM1 $vr16
|
||||
#ifdef DOUBLE
|
||||
#define VINC2 $vr17
|
||||
#define VINC4 $vr18
|
||||
#else
|
||||
#define VINC4 $vr17
|
||||
#define VINC8 $vr18
|
||||
#endif
|
||||
#define VI0 $vr20
|
||||
#define VI1 $vr21
|
||||
#define VI2 $vr22
|
||||
#define VI3 $vr8
|
||||
#define VI4 $vr19
|
||||
#define VT0 $vr23
|
||||
|
||||
PROLOGUE
|
||||
li.d i0, 0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20
|
||||
vld VM0, X, 0
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
|
||||
slli.d i0, i0, 1 //2
|
||||
vreplgr2vr.d VINC2, i0
|
||||
slli.d i0, i0, 1 //4
|
||||
vreplgr2vr.d VINC4, i0
|
||||
addi.d i0, i0, -7
|
||||
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 3
|
||||
vinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI0, i0, 1 //2
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
|
||||
slli.w i0, i0, 2 //4
|
||||
vreplgr2vr.w VINC4, i0
|
||||
slli.w i0, i0, 1 //8
|
||||
vreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 5
|
||||
vinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 3 //4
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
vld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
vadd.d VI1, VI1, VINC4
|
||||
vld VX1, X, 2 * SIZE
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmina.d x1, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x1
|
||||
vbitsel.v x2, VI2, VI1, VT0
|
||||
vld VX0, X, 4 * SIZE
|
||||
vadd.d VI1, VI2, VINC2
|
||||
vld VX1, X, 6 * SIZE
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmina.d x3, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x3
|
||||
vbitsel.v x4, VI2, VI1, VT0
|
||||
vfmina.d x3, x1, x3
|
||||
vfcmp.ceq.d VT0, x1, x3
|
||||
addi.d I, I, -1
|
||||
vbitsel.v x2, x4, x2, VT0
|
||||
vfmina.d VM1, VM0, x3
|
||||
#else
|
||||
vadd.w VI1, VI1, VINC8
|
||||
vld VX1, X, 4 * SIZE
|
||||
vadd.w VI2, VI1, VINC4
|
||||
vfmina.s VM1, VX0, VX1
|
||||
vfcmp.ceq.s VT0, VX0, VM1
|
||||
addi.d I, I, -1
|
||||
vbitsel.v x2, VI2, VI1, VT0
|
||||
vfmina.s VM1, VM0, VM1
|
||||
#endif
|
||||
VCMPEQ VT0, VM0, VM1
|
||||
addi.d X, X, 8 * SIZE
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, x2, VI0, VT0
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d VI1, VI0, 0
|
||||
vreplvei.d VI2, VI0, 1
|
||||
vreplvei.d x1, VM0, 0
|
||||
vreplvei.d x2, VM0, 1
|
||||
fcmp.ceq.d $fcc0, $f10, $f9
|
||||
bceqz $fcc0, .L26
|
||||
vfcmp.clt.d VT0, VI1, VI2
|
||||
vbitsel.v VI0, VI2, VI1, VT0
|
||||
b .L27
|
||||
#else
|
||||
vreplvei.w VI1, VI0, 0
|
||||
vreplvei.w VI2, VI0, 1
|
||||
vreplvei.w VI3, VI0, 2
|
||||
vreplvei.w VI4, VI0, 3
|
||||
vreplvei.w x1, VM0, 0
|
||||
vreplvei.w x2, VM0, 1
|
||||
vreplvei.w x3, VM0, 2
|
||||
vreplvei.w x4, VM0, 3
|
||||
vfmina.s VM1, x1, x2
|
||||
vfcmp.ceq.s VT0, VM1, x1
|
||||
vbitsel.v VINC4, VI2, VI1, VT0
|
||||
vfmina.s VM0, x3, x4
|
||||
vfcmp.ceq.s VT0, x3, VM0
|
||||
vbitsel.v VINC8, VI4, VI3, VT0
|
||||
vfmina.s VM0, VM0, VM1
|
||||
vfcmp.ceq.s VT0, VM0, VM1
|
||||
vbitsel.v VI0, VINC8, VINC4, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
vfcmp.clt.s VT0, VI1, VI0
|
||||
vbitsel.v VI0, VI0, VI1, VT0
|
||||
b .L26
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1
|
||||
move TEMP, X
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
ld.d t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
vinsgr2vr.d VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.d t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
|
||||
vinsgr2vr.d VM0, t2, 1
|
||||
slli.d i0, i0, 1 //2
|
||||
vreplgr2vr.d VINC2, i0
|
||||
slli.d i0, i0, 1 //4
|
||||
vreplgr2vr.d VINC4, i0
|
||||
addi.d i0, i0, -7
|
||||
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 3
|
||||
vinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
vinsgr2vr.d VI0, i0, 1 //2
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
vinsgr2vr.w VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
|
||||
ld.w t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
vinsgr2vr.w VM0, t2, 1
|
||||
vinsgr2vr.w VM0, t3, 2
|
||||
vinsgr2vr.w VM0, t4, 3
|
||||
slli.w i0, i0, 2 //4
|
||||
vreplgr2vr.w VINC4, i0
|
||||
slli.w i0, i0, 1 //8
|
||||
vreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 5
|
||||
vinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
vinsgr2vr.w VI0, i0, 3 //4
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L24:
|
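// INCX != 1 main loop: scalar loads fill the vector lanes; the index vectors advance by the lane-count increments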
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vadd.d VI1, VI1, VINC4
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t1, 0
|
||||
vinsgr2vr.d VX1, t2, 1
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmina.d x1, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x1
|
||||
vbitsel.v x2, VI2, VI1, VT0
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX0, t1, 0
|
||||
vinsgr2vr.d VX0, t2, 1
|
||||
vadd.d VI1, VI2, VINC2
|
||||
ld.d t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.d VX1, t1, 0
|
||||
vinsgr2vr.d VX1, t2, 1
|
||||
vadd.d VI2, VI1, VINC2
|
||||
vfmina.d x3, VX0, VX1
|
||||
vfcmp.ceq.d VT0, VX0, x3
|
||||
vbitsel.v x4, VI2, VI1, VT0
|
||||
vfmina.d x3, x1, x3
|
||||
vfcmp.ceq.d VT0, x1, x3
|
||||
addi.d I, I, -1
|
||||
vbitsel.v x2, x4, x2, VT0
|
||||
vfmina.d VM1, VM0, x3
|
||||
vfcmp.ceq.d VT0, VM0, VM1
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, x2, VI0, VT0
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX0, t1, 0
|
||||
vinsgr2vr.w VX0, t2, 1
|
||||
vinsgr2vr.w VX0, t3, 2
|
||||
vinsgr2vr.w VX0, t4, 3
|
||||
vadd.w VI1, VI1, VINC8
|
||||
ld.w t1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
vinsgr2vr.w VX1, t1, 0
|
||||
vinsgr2vr.w VX1, t2, 1
|
||||
vinsgr2vr.w VX1, t3, 2
|
||||
vinsgr2vr.w VX1, t4, 3
|
||||
vadd.w VI2, VI1, VINC4
|
||||
vfmina.s VM1, VX0, VX1
|
||||
vfcmp.ceq.s VT0, VX0, VM1
|
||||
vbitsel.v VI2, VI2, VI1, VT0
|
||||
vfmina.s VM1, VM0, VM1
|
||||
vfcmp.ceq.s VT0, VM0, VM1
|
||||
addi.d I, I, -1
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, VI2, VI0, VT0
|
||||
#endif
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
#ifdef DOUBLE
|
||||
vreplvei.d VI1, VI0, 0
|
||||
vreplvei.d VI2, VI0, 1
|
||||
vreplvei.d x1, VM0, 0
|
||||
vreplvei.d x2, VM0, 1
|
||||
fcmp.ceq.d $fcc0, $f10, $f9
|
||||
bceqz $fcc0, .L26
|
||||
vfcmp.clt.d VT0, VI1, VI2
|
||||
vbitsel.v VI0, VI2, VI1, VT0
|
||||
b .L27
|
||||
#else
|
||||
vreplvei.w VI1, VI0, 0
|
||||
vreplvei.w VI2, VI0, 1
|
||||
vreplvei.w VI3, VI0, 2
|
||||
vreplvei.w VI4, VI0, 3
|
||||
vreplvei.w x1, VM0, 0
|
||||
vreplvei.w x2, VM0, 1
|
||||
vreplvei.w x3, VM0, 2
|
||||
vreplvei.w x4, VM0, 3
|
||||
vfmina.s VM1, x1, x2
|
||||
vfcmp.ceq.s VT0, VM1, x1
|
||||
vbitsel.v VINC4, VI2, VI1, VT0
|
||||
vfmina.s VM0, x3, x4
|
||||
vfcmp.ceq.s VT0, x3, VM0
|
||||
vbitsel.v VINC8, VI4, VI3, VT0
|
||||
vfmina.s VM0, VM0, VM1
|
||||
vfcmp.ceq.s VT0, VM0, VM1
|
||||
vbitsel.v VI0, VINC8, VINC4, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
vfcmp.clt.s VT0, VI1, VI0
|
||||
vbitsel.v VI0, VI0, VI1, VT0
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
#ifdef DOUBLE
|
||||
vfmina.d VM0, x1, x2
|
||||
vfcmp.ceq.d VT0, x1, VM0
|
||||
vbitsel.v VI0, VI2, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
movfr2gr.d i0, $f20
|
||||
|
||||
#else
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L27
|
||||
vfcmp.clt.s VT0, VI2, VI0
|
||||
vbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L28
|
||||
vfcmp.clt.s VT0, VI3, VI0
|
||||
vbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L28:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L29
|
||||
vfcmp.clt.s VT0, VI4, VI0
|
||||
vbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L29:
|
||||
movfr2gr.s i0, $f20
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L21: // N<8
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
srai.d i1, N, 3
|
||||
slli.d i1, i1, 3
|
||||
addi.d i1, i1, 1 //current index
|
||||
movgr2fr.d $f21, i1
|
||||
movgr2fr.d $f20, i0
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
LD $f9, X, 0
|
||||
addi.d I, I, -1
|
||||
VFMINA VM1, x1, VM0
|
||||
VCMPEQ VT0, VM0, VM1
|
||||
add.d X, X, INCX
|
||||
vbitsel.v VM0, VM1, VM0, VT0
|
||||
vbitsel.v VI0, VI1, VI0, VT0
|
||||
addi.d i1, i1, 1
|
||||
MTC $f21, i1
|
||||
blt $r0, I, .L22
|
||||
MTG i0, $f20
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
move $r4, $r17
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
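
The last hunk is the complex-arithmetic variant. For ic/izamax the magnitude of an element is |Re| + |Im| (the BLAS cabs1 convention), not the Euclidean modulus, which is why the kernel only needs sign-fixing and an add rather than multiplies. Scalar reference semantics (a sketch; the name and signature are illustrative):

#include <math.h>
#include <stddef.h>

/* 1-based index of the first element maximizing |Re(x_i)| + |Im(x_i)|,
 * with x stored interleaved re,im,re,im,... and a stride of incx
 * complex elements; 0 when n or incx is not positive. */
static size_t icamax_ref(size_t n, const float *x, size_t incx)
{
    if (n == 0 || incx == 0) return 0;
    size_t best = 1;
    float m = fabsf(x[0]) + fabsf(x[1]);
    for (size_t i = 1; i < n; i++) {
        const float *p = x + 2 * i * incx;
        float v = fabsf(p[0]) + fabsf(p[1]);
        if (v > m) { m = v; best = i + 1; }
    }
    return best;
}
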
@ -0,0 +1,562 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define t1 $r13
|
||||
#define t2 $r15
|
||||
#define t3 $r18
|
||||
#define t4 $r16
|
||||
#define i0 $r17
|
||||
#define i1 $r14
|
||||
#define TEMP $r19
|
||||
#define a0 $f12
|
||||
#define a1 $f13
|
||||
#define s1 $f15
|
||||
#define x1 $xr9
|
||||
#define x2 $xr10
|
||||
#define x3 $xr11
|
||||
#define x4 $xr12
|
||||
#define VX0 $xr13
|
||||
#define VX1 $xr14
|
||||
#define VM0 $xr15
|
||||
#define VM1 $xr16
|
||||
#define VINC4 $xr17
|
||||
#define VINC8 $xr18
|
||||
#define VI0 $xr20
|
||||
#define VI1 $xr21
|
||||
#define VI2 $xr22
|
||||
#define VI3 $xr8
|
||||
#define VI4 $xr19
|
||||
#define VT0 $xr23
|
||||
|
||||
PROLOGUE
|
||||
li.d i0, 0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
xvxor.v VM0, VM0, VM0
|
||||
slli.d TEMP, TEMP, ZBASE_SHIFT
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
xvxor.v VI3, VI3, VI3 // 0
|
||||
#ifdef DOUBLE
|
||||
li.d I, -1
|
||||
xvreplgr2vr.d VI4, I
|
||||
xvffint.d.l VI4, VI4 // -1
|
||||
bne INCX, TEMP, .L20
|
||||
addi.d i0, i0, 1
|
||||
srai.d I, N, 2
|
||||
bge $r0, I, .L21
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
addi.d i0, i0, -7
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, -1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI0, i0, 1 //3
|
||||
addi.d i0, i0, -1
|
||||
xvinsgr2vr.d VI0, i0, 2 //2
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
li.w I, -1
|
||||
xvreplgr2vr.w VI4, I
|
||||
xvffint.s.w VI4, VI4 // -1
|
||||
bne INCX, TEMP, .L20
|
||||
addi.w i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, -3
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI0, i0, 2 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //6
|
||||
addi.w i0, i0, -3
|
||||
xvinsgr2vr.w VI0, i0, 4 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //4
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L10:
|
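// contiguous path: pickev/pickod split re/im lanes, negative lanes are sign-flipped via mul by -1 plus clt/bitsel, then |re|+|im| is accumulated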
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvadd.d VI1, VI1, VINC4
|
||||
xvld VX1, X, 4 * SIZE
|
||||
addi.d I, I, -1
|
||||
xvpickev.d x1, VX1, VX0
|
||||
xvpickod.d x2, VX1, VX0
|
||||
xvfmul.d x3, VI4, x1
|
||||
xvfmul.d x4, VI4, x2
|
||||
xvfcmp.clt.d VT0, x1, VI3
|
||||
xvfcmp.clt.d VINC8, x2, VI3
|
||||
xvbitsel.v x1, x1, x3, VT0
|
||||
xvbitsel.v x2, x2, x4, VINC8
|
||||
#else
|
||||
xvadd.w VI1, VI1, VINC8
|
||||
xvld VX1, X, 8 * SIZE
|
||||
addi.d I, I, -1
|
||||
xvpickev.w x1, VX1, VX0
|
||||
xvpickod.w x2, VX1, VX0
|
||||
xvfmul.s x3, VI4, x1
|
||||
xvfmul.s x4, VI4, x2
|
||||
xvfcmp.clt.s VT0, x1, VI3
|
||||
xvfcmp.clt.s VINC4, x2, VI3
|
||||
xvbitsel.v x1, x1, x3, VT0
|
||||
xvbitsel.v x2, x2, x4, VINC4
|
||||
#endif
|
||||
XVFADD x1, x1, x2
|
||||
XVFMAX x3, VM0, x1
|
||||
XVCMPEQ VT0, x3, VM0
|
||||
addi.d X, X, 8 * SIZE
|
||||
xvbitsel.v VM0, x3, VM0, VT0
|
||||
xvbitsel.v VI0, VI1, VI0, VT0
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VI1, VI0, 0
|
||||
xvpickve.d VI2, VI0, 1
|
||||
xvpickve.d VI3, VI0, 2
|
||||
xvpickve.d VI4, VI0, 3
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
xvpickve.d x3, VM0, 2
|
||||
xvpickve.d x4, VM0, 3
|
||||
xvfmax.d VM1, x1, x2
|
||||
xvfcmp.ceq.d VT0, VM1, x1
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
xvfmax.d VM0, x3, x4
|
||||
xvfcmp.ceq.d VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI4, VI3, VT0
|
||||
xvfmax.d VM0, VM0, VM1
|
||||
xvfcmp.ceq.d VT0, VM0, VM1
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
#else
|
||||
xvxor.v VX0, VX0, VX0
|
||||
xvor.v VX0, VI0, VX0
|
||||
xvxor.v VX1, VX1, VX1
|
||||
xvor.v VX1, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 0
|
||||
xvpickve.w VI2, VI0, 1
|
||||
xvpickve.w VI3, VI0, 2
|
||||
xvpickve.w VI4, VI0, 3
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
xvfcmp.clt.s VT0, x1, x2
|
||||
xvbitsel.v VM1, x1, x2, VT0
|
||||
xvbitsel.v VINC4, VI1, VI2, VT0
|
||||
xvfcmp.clt.s VT0, x3, x4
|
||||
xvbitsel.v VM0, x3, x4, VT0
|
||||
xvbitsel.v VINC8, VI3, VI4, VT0
|
||||
xvfcmp.clt.s VT0, VM0, VM1
|
||||
xvbitsel.v VM0, VM0, VM1, VT0
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
#endif
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
XVCMPLT VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
b .L26
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
srai.d I, N, 2
|
||||
bge $r0, I, .L21
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
addi.d i0, i0, -7
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, -1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI0, i0, 1 //3
|
||||
addi.d i0, i0, -1
|
||||
xvinsgr2vr.d VI0, i0, 2 //2
|
||||
addi.d i0, i0, 2
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, -3
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI0, i0, 2 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //6
|
||||
addi.w i0, i0, -3
|
||||
xvinsgr2vr.w VI0, i0, 4 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //4
|
||||
addi.w i0, i0, 3
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
.align 3
|
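// index lanes are seeded in the order 1,2,5,6,3,4,7,8, mirroring the pickev/pickod lane layout of the contiguous path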
||||
|
||||
.L24:
|
||||
#ifdef DOUBLE
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d x1, t1, 0
|
||||
xvinsgr2vr.d x2, t2, 0
|
||||
xvinsgr2vr.d x1, t3, 1
|
||||
xvinsgr2vr.d x2, t4, 1
|
||||
xvadd.d VI1, VI1, VINC4
|
||||
ld.d t1, X, 0 * SIZE
|
||||
ld.d t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.d t3, X, 0 * SIZE
|
||||
ld.d t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.d x1, t1, 2
|
||||
xvinsgr2vr.d x2, t2, 2
|
||||
xvinsgr2vr.d x1, t3, 3
|
||||
xvinsgr2vr.d x2, t4, 3
|
||||
addi.d I, I, -1
|
||||
xvfmul.d x3, VI4, x1
|
||||
xvfmul.d x4, VI4, x2
|
||||
xvfcmp.clt.d VT0, x1, VI3
|
||||
xvfcmp.clt.d VINC8, x2, VI3
|
||||
xvbitsel.v x1, x1, x3, VT0
|
||||
xvbitsel.v x2, x2, x4, VINC8
|
||||
xvfadd.d x1, x1, x2
|
||||
xvfmax.d x3, VM0, x1
|
||||
xvfcmp.ceq.d VT0, x3, VM0
|
||||
#else
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 0
|
||||
xvinsgr2vr.w x2, t2, 0
|
||||
xvinsgr2vr.w x1, t3, 1
|
||||
xvinsgr2vr.w x2, t4, 1
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 2
|
||||
xvinsgr2vr.w x2, t2, 2
|
||||
xvinsgr2vr.w x1, t3, 3
|
||||
xvinsgr2vr.w x2, t4, 3
|
||||
xvadd.w VI1, VI1, VINC8
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 4
|
||||
xvinsgr2vr.w x2, t2, 4
|
||||
xvinsgr2vr.w x1, t3, 5
|
||||
xvinsgr2vr.w x2, t4, 5
|
||||
xvadd.w VI1, VI1, VINC8
|
||||
ld.w t1, X, 0 * SIZE
|
||||
ld.w t2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ld.w t3, X, 0 * SIZE
|
||||
ld.w t4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
xvinsgr2vr.w x1, t1, 6
|
||||
xvinsgr2vr.w x2, t2, 6
|
||||
xvinsgr2vr.w x1, t3, 7
|
||||
xvinsgr2vr.w x2, t4, 7
|
||||
addi.d I, I, -1
|
||||
xvfmul.s x3, VI4, x1
|
||||
xvfmul.s x4, VI4, x2
|
||||
xvfcmp.clt.s VT0, x1, VI3
|
||||
xvfcmp.clt.s VINC8, x2, VI3
|
||||
xvbitsel.v x1, x1, x3, VT0
|
||||
xvbitsel.v x2, x2, x4, VINC8
|
||||
xvfadd.s x1, x1, x2
|
||||
xvfmax.s x3, VM0, x1
|
||||
xvfcmp.ceq.s VT0, x3, VM0
|
||||
#endif
|
||||
xvbitsel.v VM0, x3, VM0, VT0
|
||||
xvbitsel.v VI0, VI1, VI0, VT0
|
||||
blt $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VI1, VI0, 0
|
||||
xvpickve.d VI2, VI0, 1
|
||||
xvpickve.d VI3, VI0, 2
|
||||
xvpickve.d VI4, VI0, 3
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
xvpickve.d x3, VM0, 2
|
||||
xvpickve.d x4, VM0, 3
|
||||
xvfmaxa.d VM1, x1, x2
|
||||
xvfcmp.ceq.d VT0, VM1, x1
|
||||
xvbitsel.v VINC4, VI2, VI1, VT0
|
||||
xvfmaxa.d VM0, x3, x4
|
||||
xvfcmp.ceq.d VT0, x3, VM0
|
||||
xvbitsel.v VINC8, VI4, VI3, VT0
|
||||
xvfmaxa.d VM0, VM0, VM1
|
||||
xvfcmp.ceq.d VT0, VM0, VM1
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
#else
|
||||
xvxor.v VX0, VX0, VX0
|
||||
xvor.v VX0, VI0, VX0
|
||||
xvxor.v VX1, VX1, VX1
|
||||
xvor.v VX1, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 0
|
||||
xvpickve.w VI2, VI0, 1
|
||||
xvpickve.w VI3, VI0, 2
|
||||
xvpickve.w VI4, VI0, 3
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
xvfcmp.clt.s VT0, x1, x2
|
||||
xvbitsel.v VM1, x1, x2, VT0
|
||||
xvbitsel.v VINC4, VI1, VI2, VT0
|
||||
xvfcmp.clt.s VT0, x3, x4
|
||||
xvbitsel.v VM0, x3, x4, VT0
|
||||
xvbitsel.v VINC8, VI3, VI4, VT0
|
||||
xvfcmp.clt.s VT0, VM0, VM1
|
||||
xvbitsel.v VM0, VM0, VM1, VT0
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
#endif
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
XVCMPLT VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L27
|
||||
XVCMPLT VT0, VI2, VI0
|
||||
xvbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L28
|
||||
XVCMPLT VT0, VI3, VI0
|
||||
xvbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L28:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L29
|
||||
XVCMPLT VT0, VI4, VI0
|
||||
xvbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L29:
|
||||
#ifdef DOUBLE
|
||||
movfr2gr.d i0, $f20
|
||||
#else
|
||||
fmov.s $f16, $f20
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
#ifdef DOUBLE
|
||||
#else
|
||||
.L252:
|
||||
xvxor.v VI0, VI0, VI0
|
||||
xvor.v VI0, VI0, VX0
|
||||
fmov.s $f13, $f15
|
||||
xvxor.v VM0, VM0, VM0
|
||||
xvor.v VM0, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 4
|
||||
xvpickve.w VI2, VI0, 5
|
||||
xvpickve.w VI3, VI0, 6
|
||||
xvpickve.w VI4, VI0, 7
|
||||
xvpickve.w x1, VM0, 4
|
||||
xvpickve.w x2, VM0, 5
|
||||
xvpickve.w x3, VM0, 6
|
||||
xvpickve.w x4, VM0, 7
|
||||
xvfcmp.clt.s VT0, x1, x2
|
||||
xvbitsel.v x1, x1, x2, VT0
|
||||
xvbitsel.v VINC4, VI1, VI2, VT0
|
||||
xvfcmp.clt.s VT0, x3, x4
|
||||
xvbitsel.v VM0, x3, x4, VT0
|
||||
xvbitsel.v VINC8, VI3, VI4, VT0
|
||||
xvfcmp.clt.s VT0, VM0, x1
|
||||
xvbitsel.v VM0, VM0, x1, VT0
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L262
|
||||
xvfcmp.clt.s VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
.align 3
|
||||
|
||||
.L262:
|
||||
fcmp.ceq.d $fcc0, $f15, $f10
|
||||
bceqz $fcc0, .L272
|
||||
xvfcmp.clt.s VT0, VI2, VI0
|
||||
xvbitsel.v VI0, VI0, VI2, VT0
|
||||
.align 3
|
||||
|
||||
.L272:
|
||||
fcmp.ceq.d $fcc0, $f15, $f11
|
||||
bceqz $fcc0, .L282
|
||||
xvfcmp.clt.s VT0, VI3, VI0
|
||||
xvbitsel.v VI0, VI0, VI3, VT0
|
||||
.align 3
|
||||
|
||||
.L282:
|
||||
fcmp.ceq.d $fcc0, $f15, $f12
|
||||
bceqz $fcc0, .L292
|
||||
xvfcmp.clt.s VT0, VI4, VI0
|
||||
xvbitsel.v VI0, VI0, VI4, VT0
|
||||
.align 3
|
||||
|
||||
.L292:
|
||||
fcmp.clt.s $fcc0, $f15, $f13
|
||||
fsel $f15, $f15, $f13, $fcc0
|
||||
fsel $f20, $f20, $f16, $fcc0
|
||||
movfr2gr.s i0, $f20
|
||||
|
||||
#endif
|
||||
.L21: //N<8
|
||||
#ifdef DOUBLE
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
srai.d i1, N, 2
|
||||
slli.d i1, i1, 2
|
||||
#else
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
srai.d i1, N, 3
|
||||
slli.d i1, i1, 3
|
||||
#endif
|
||||
addi.d i1, i1, 1 //current index
|
||||
movgr2fr.d $f21, i1
|
||||
movgr2fr.d $f20, i0
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
LD a0, X, 0 * SIZE
|
||||
LD a1, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS a0, a0
|
||||
FABS a1, a1
|
||||
ADD a0, a0, a1
|
||||
FMAX a1, s1, a0
|
||||
CMPEQ $fcc0, s1, a1
|
||||
add.d X, X, INCX
|
||||
fsel s1, a1, s1, $fcc0
|
||||
fsel $f20, $f21, $f20, $fcc0
|
||||
addi.d i1, i1, 1
|
||||
movgr2fr.d $f21, i1
|
||||
blt $r0, I, .L22
|
||||
MTG i0, $f20
|
||||
.align 3
|
||||
|
||||
|
||||
.L999:
|
||||
move $r4, $r17
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
EPILOGUE
|
||||
|
|
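For orientation, the kernel that ends above implements ICAMAX-style semantics: return the 1-based index of the complex element with the largest |Re| + |Im|, keeping the first occurrence on ties and returning 0 for invalid input. A minimal scalar C sketch of that contract, assuming the hypothetical name icamax_ref (reference only, not part of this patch):

#include <math.h>

/* Scalar sketch of the vector kernel's semantics: n complex elements,
 * interleaved re/im in x, stride incx counted in complex elements;
 * returns a 1-based index, or 0 when n <= 0 or incx <= 0. */
static long icamax_ref(long n, const float *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    float maxval = fabsf(x[0]) + fabsf(x[1]);
    for (long i = 1; i < n; i++) {
        const float *p = x + 2 * i * incx;
        float v = fabsf(p[0]) + fabsf(p[1]);
        if (v > maxval) { maxval = v; best = i + 1; } /* first max wins */
    }
    return best;
}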
@@ -0,0 +1,434 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define a0 $f12
#define a1 $f13
#define s1 $f15
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC4 $vr17
#define VINC8 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23

PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
vxor.v VM0, VM0, VM0
slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
vxor.v VI3, VI3, VI3 // 0
#ifdef DOUBLE
li.d I, -1
vreplgr2vr.d VI4, I
vffint.d.l VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -3
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
li.w I, -1
vreplgr2vr.w VI4, I
vffint.s.w VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.w i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L10:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE
addi.d I, I, -1
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VI4, x1
vfmul.d x4, VI4, x2
vfcmp.clt.d VT0, x1, VI3
vfcmp.clt.d VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
vfadd.d x1, x1, x2
vfmax.d x3, VM0, x1
vfcmp.ceq.d VT0, x3, VM0
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI1, VINC4
vld VX1, X, 6 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VI4, x1
vfmul.d x4, VI4, x2
#else
vadd.w VI1, VI1, VINC4
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vfmul.s x3, VI4, x1
vfmul.s x4, VI4, x2
#endif
VCMPLT VT0, x1, VI3
VCMPLT VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
VFADD x1, x1, x2
VFMAX x3, VM0, x1
VCMPEQ VT0, x3, VM0
addi.d X, X, 8 * SIZE
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L10
.align 3

.L15:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
#endif
.align 3

.L20: // INCX!=1
#ifdef DOUBLE
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -3
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
addi.w i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
vadd.d VI1, VI1, VINC4
vfmul.d x3, VI4, x1
vfmul.d x4, VI4, x2
vfcmp.clt.d VT0, x1, VI3
vfcmp.clt.d VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
vfadd.d x1, x1, x2
vfmax.d x3, VM0, x1
ld.d t1, X, 0 * SIZE
vfcmp.ceq.d VT0, x3, VM0
ld.d t2, X, 1 * SIZE
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
vadd.d VI1, VI1, VINC4
addi.d I, I, -1
vfmul.d x3, VI4, x1
vfmul.d x4, VI4, x2
vfcmp.clt.d VT0, x1, VI3
vfcmp.clt.d VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
vfadd.d x1, x1, x2
vfmax.d x3, VM0, x1
vfcmp.ceq.d VT0, x3, VM0
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
vadd.w VI1, VI1, VINC4
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
addi.d I, I, -1
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vfmul.s x3, VI4, x1
vfmul.s x4, VI4, x2
vfcmp.clt.s VT0, x1, VI3
vfcmp.clt.s VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
vfadd.s x1, x1, x2
vfmax.s x3, VM0, x1
vfcmp.ceq.s VT0, x3, VM0
#endif
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmaxa.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmaxa.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmaxa.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
#endif
.align 3

#ifdef DOUBLE
.L26:
vfmaxa.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
vbitsel.v VI0, VI2, VI1, VT0
.align 3

.L27:
movfr2gr.d i0, $f20
.align 3
#else
.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
movfr2gr.s i0, $f20
.align 3

#endif
.L21: //N<4
andi I, N, 3
bge $r0, I, .L999
srai.d i1, N, 2
slli.d i1, i1, 2
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L22:
LD a0, X, 0 * SIZE
LD a1, X, 1 * SIZE
addi.d I, I, -1
FABS a0, a0
FABS a1, a1
ADD a0, a0, a1
FMAX a1, s1, a0
CMPEQ $fcc0, s1, a1
add.d X, X, INCX
fsel s1, a1, s1, $fcc0
fsel $f20, $f21, $f20, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3

.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
@@ -0,0 +1,555 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define a0 $f12
#define a1 $f13
#define s1 $f15
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23

PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
LD a0, X, 0 * SIZE
LD a1, X, 1 * SIZE
FABS a0, a0
FABS a1, a1
ADD s1, a1, a0
#ifdef DOUBLE
xvreplve0.d VM0, VM0
xvxor.v VI3, VI3, VI3 // 0
li.d I, -1
xvreplgr2vr.d VI4, I
xvffint.d.l VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, -1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 1 //3
addi.d i0, i0, -1
xvinsgr2vr.d VI0, i0, 2 //2
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 3 //4
#else
xvreplve0.w VM0, VM0
xvxor.v VI3, VI3, VI3 // 0
li.w I, -1
xvreplgr2vr.w VI4, I
xvffint.s.w VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 3
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, -3
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 3
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 2 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //6
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 4 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //4
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
#endif
.align 3

.L10:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvadd.d VI1, VI1, VINC4
xvld VX1, X, 4 * SIZE
addi.d I, I, -1
xvpickev.d x1, VX1, VX0
xvpickod.d x2, VX1, VX0
xvfmul.d x3, VI4, x1
xvfmul.d x4, VI4, x2
xvfcmp.clt.d VT0, x1, VI3
xvfcmp.clt.d VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
#else
xvadd.w VI1, VI1, VINC8
xvld VX1, X, 8 * SIZE
addi.d I, I, -1
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
xvfmul.s x3, VI4, x1
xvfmul.s x4, VI4, x2
xvfcmp.clt.s VT0, x1, VI3
xvfcmp.clt.s VINC4, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC4
#endif
XVFADD x1, x1, x2
XVFMIN x3, VM0, x1
XVCMPEQ VT0, x3, VM0
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L10
.align 3

.L15:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmin.d VM1, x1, x2
xvfcmp.ceq.d VT0, VM1, x1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmin.d VM0, x3, x4
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmin.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
xvbitsel.v VI0, VINC8, VINC4, VT0
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
#endif
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3

.L20: // INCX!=1
#ifdef DOUBLE
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
addi.d i0, i0, -7
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, -1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 2
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 1 //3
addi.d i0, i0, -1
xvinsgr2vr.d VI0, i0, 2 //2
addi.d i0, i0, 2
xvinsgr2vr.d VI0, i0, 3 //4
#else
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 3
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, -3
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 3
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 2 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //6
addi.w i0, i0, -3
xvinsgr2vr.w VI0, i0, 4 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //4
addi.w i0, i0, 3
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 0
xvinsgr2vr.d x2, t2, 0
xvinsgr2vr.d x1, t3, 1
xvinsgr2vr.d x2, t4, 1
xvadd.d VI1, VI1, VINC4
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.d x1, t1, 2
xvinsgr2vr.d x2, t2, 2
xvinsgr2vr.d x1, t3, 3
xvinsgr2vr.d x2, t4, 3
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 0
xvinsgr2vr.w x2, t2, 0
xvinsgr2vr.w x1, t3, 1
xvinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 2
xvinsgr2vr.w x2, t2, 2
xvinsgr2vr.w x1, t3, 3
xvinsgr2vr.w x2, t4, 3
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 4
xvinsgr2vr.w x2, t2, 4
xvinsgr2vr.w x1, t3, 5
xvinsgr2vr.w x2, t4, 5
xvadd.w VI1, VI1, VINC8
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
xvinsgr2vr.w x1, t1, 6
xvinsgr2vr.w x2, t2, 6
xvinsgr2vr.w x1, t3, 7
xvinsgr2vr.w x2, t4, 7
xvpickev.w x1, VX1, VX0
xvpickod.w x2, VX1, VX0
#endif
addi.d I, I, -1
XVFMUL x3, VI4, x1
XVFMUL x4, VI4, x2
XVCMPLT VT0, x1, VI3
XVCMPLT VINC8, x2, VI3
xvbitsel.v x1, x1, x3, VT0
xvbitsel.v x2, x2, x4, VINC8
XVFADD x1, x1, x2
XVFMIN x3, VM0, x1
XVCMPEQ VT0, x3, VM0
xvbitsel.v VM0, x3, VM0, VT0
xvbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
xvfmina.d VM1, x1, x2
xvfcmp.ceq.d VT0, VM1, x1
xvbitsel.v VINC4, VI2, VI1, VT0
xvfmina.d VM0, x3, x4
xvfcmp.ceq.d VT0, x3, VM0
xvbitsel.v VINC8, VI4, VI3, VT0
xvfmina.d VM0, VM0, VM1
xvfcmp.ceq.d VT0, VM0, VM1
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
#endif
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
#ifdef DOUBLE
movfr2gr.d i0, $f20
.align 3

.L21: //N<4
andi I, N, 3
bge $r0, I, .L999
srai.d i1, N, 2
slli.d i1, i1, 2
#else
fmov.s $f16, $f20
.align 3

.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v x1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L262:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L272:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L292:
fcmp.clt.s $fcc0, $f15, $f13
fsel $f15, $f15, $f13, $fcc0
fsel $f20, $f20, $f16, $fcc0
movfr2gr.s i0, $f20

.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
#endif
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L22:
LD a0, X, 0 * SIZE
LD a1, X, 1 * SIZE
addi.d I, I, -1
FABS a0, a0
FABS a1, a1
ADD a0, a0, a1
FMIN a1, s1, a0
CMPEQ $fcc0, s1, a1
add.d X, X, INCX
fsel s1, a1, s1, $fcc0
fsel $f20, $f21, $f20, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3


.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
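The kernel that ends above is the min-index counterpart of the earlier one: the reduction direction flips (xvfmin/xvfmina and the scalar FMIN replace the max forms), while the loop structure is otherwise unchanged. The corresponding change to the earlier reference sketch, again under a hypothetical name that is not part of this patch:

/* Same contract as icamax_ref above, but tracking the smallest
 * |Re| + |Im|; first occurrence wins on ties. */
static long icamin_ref(long n, const float *x, long incx)
{
    if (n <= 0 || incx <= 0) return 0;
    long best = 1;
    float minval = fabsf(x[0]) + fabsf(x[1]);
    for (long i = 1; i < n; i++) {
        const float *p = x + 2 * i * incx;
        float v = fabsf(p[0]) + fabsf(p[1]);
        if (v < minval) { minval = v; best = i + 1; } /* first min wins */
    }
    return best;
}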
@@ -0,0 +1,425 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define a0 $f12
#define a1 $f13
#define s1 $f15
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VINC4 $vr17
#define VINC8 $vr18
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23

PROLOGUE
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
LD a0, X, 0 * SIZE
LD a1, X, 1 * SIZE
FABS a0, a0
FABS a1, a1
ADD s1, a1, a0
vreplvei.w VM0, VM0, 0
vxor.v VI3, VI3, VI3 // 0
#ifdef DOUBLE
li.d I, -1
vreplgr2vr.d VI4, I
vffint.d.l VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -3
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
li.w I, -1
vreplgr2vr.w VI4, I
vffint.s.w VI4, VI4 // -1
bne INCX, TEMP, .L20
addi.w i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L10:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vadd.d VI1, VI1, VINC4
vld VX1, X, 2 * SIZE
addi.d I, I, -1
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VI4, x1
vfmul.d x4, VI4, x2
vfcmp.clt.d VT0, x1, VI3
vfcmp.clt.d VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
vfadd.d x1, x1, x2
vfmin.d x3, VM0, x1
vfcmp.ceq.d VT0, x3, VM0
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI1, VINC4
vld VX1, X, 6 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
#else
vadd.w VI1, VI1, VINC4
vld VX1, X, 4 * SIZE
addi.d I, I, -1
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
#endif
VFMUL x3, VI4, x1
VFMUL x4, VI4, x2
VCMPLT VT0, x1, VI3
VCMPLT VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
VFADD x1, x1, x2
VFMIN x3, VM0, x1
VCMPEQ VT0, x3, VM0
addi.d X, X, 8 * SIZE
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L10
.align 3

.L15:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmina.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmina.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmina.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
#endif
.align 3

.L20: // INCX!=1
#ifdef DOUBLE
addi.d i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d VINC4, i0
addi.d i0, i0, -3
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
addi.w i0, i0, 1
srai.d I, N, 2
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w VINC4, i0
addi.w i0, i0, -7
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
vadd.d VI1, VI1, VINC4
vfmul.d x3, VI4, x1
vfmul.d x4, VI4, x2
vfcmp.clt.d VT0, x1, VI3
vfcmp.clt.d VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
vfadd.d x1, x1, x2
vfmin.d x3, VM0, x1
ld.d t1, X, 0 * SIZE
vfcmp.ceq.d VT0, x3, VM0
ld.d t2, X, 1 * SIZE
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
vadd.d VI1, VI1, VINC4
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
vadd.w VI1, VI1, VINC4
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
#endif
addi.d I, I, -1
VFMUL x3, VI4, x1
VFMUL x4, VI4, x2
VCMPLT VT0, x1, VI3
VCMPLT VINC8, x2, VI3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VINC8
VFADD x1, x1, x2
VFMIN x3, VM0, x1
VCMPEQ VT0, x3, VM0
vbitsel.v VM0, x3, VM0, VT0
vbitsel.v VI0, VI1, VI0, VT0
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
vfcmp.clt.d VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfmina.s VM1, x1, x2
vfcmp.ceq.s VT0, VM1, x1
vbitsel.v VINC4, VI2, VI1, VT0
vfmina.s VM0, x3, x4
vfcmp.ceq.s VT0, x3, VM0
vbitsel.v VINC8, VI4, VI3, VT0
vfmina.s VM0, VM0, VM1
vfcmp.ceq.s VT0, VM0, VM1
vbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
#endif
.align 3

.L26:
#ifdef DOUBLE
vfmina.d VM0, x1, x2
vfcmp.ceq.d VT0, x1, VM0
#else
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
vfcmp.clt.s VT0, VI2, VI0
#endif
vbitsel.v VI0, VI0, VI2, VT0
.align 3

.L27:
#ifdef DOUBLE
movfr2gr.d i0, $f20
.align 3
#else
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
vfcmp.clt.s VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
vfcmp.clt.s VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
movfr2gr.s i0, $f20
.align 3

#endif
.L21: //N<4
andi I, N, 3
bge $r0, I, .L999
srai.d i1, N, 2
slli.d i1, i1, 2
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L22:
LD a0, X, 0 * SIZE
LD a1, X, 1 * SIZE
addi.d I, I, -1
FABS a0, a0
FABS a1, a1
ADD a0, a0, a1
FMIN a1, s1, a0
CMPEQ $fcc0, s1, a1
add.d X, X, INCX
fsel s1, a1, s1, $fcc0
fsel $f20, $f21, $f20, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3

.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
@ -0,0 +1,533 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r12
|
||||
#define t1 $r13
|
||||
#define t2 $r15
|
||||
#define t3 $r18
|
||||
#define t4 $r16
|
||||
#define i0 $r17
|
||||
#define i1 $r14
|
||||
#define TEMP $r19
|
||||
#define x1 $xr9
|
||||
#define x2 $xr10
|
||||
#define x3 $xr11
|
||||
#define x4 $xr12
|
||||
#define VX0 $xr13
|
||||
#define VX1 $xr14
|
||||
#define VM0 $xr15
|
||||
#define VM1 $xr16
|
||||
#define VINC4 $xr17
|
||||
#define VINC8 $xr18
|
||||
#define VI0 $xr20
|
||||
#define VI1 $xr21
|
||||
#define VI2 $xr22
|
||||
#define VI3 $xr8
|
||||
#define VI4 $xr19
|
||||
#define VT0 $xr23
|
||||
|
||||
PROLOGUE
|
||||
li.d i0, 0
|
||||
bge $r0, N, .L999
|
||||
bge $r0, INCX, .L999
|
||||
li.d TEMP, 1
|
||||
slli.d TEMP, TEMP, BASE_SHIFT
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20
|
||||
xvld VM0, X, 0
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
slli.d i0, i0, 1 //8
|
||||
xvreplgr2vr.d VINC8, i0
|
||||
addi.d i0, i0, -15
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 5
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 1 //2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 2 //3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 4 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
xvld VX0, X, 0 * SIZE
|
||||
#ifdef DOUBLE
|
||||
xvadd.d VI1, VI1, VINC8
|
||||
xvld VX1, X, 4 * SIZE
|
||||
xvadd.d VI2, VI1, VINC4
|
||||
xvfcmp.clt.d VT0, VX0, VX1
|
||||
addi.d I, I, -1
|
||||
xvbitsel.v VM1, VX0, VX1, VT0
|
||||
xvbitsel.v VI2, VI1, VI2, VT0
|
||||
xvfcmp.clt.d VT0, VM0, VM1
|
||||
addi.d X, X, 8 * SIZE
|
||||
xvbitsel.v VM0, VM0, VM1, VT0
|
||||
xvbitsel.v VI0, VI0, VI2, VT0
|
||||
#else
|
||||
xvadd.w VI1, VI1, VINC8
|
||||
xvfcmp.clt.s VT0, VM0, VX0
|
||||
addi.d I, I, -1
|
||||
xvbitsel.v VM0, VM0, VX0, VT0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
addi.d X, X, 8 * SIZE
|
||||
#endif
|
||||
blt $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
#ifdef DOUBLE
|
||||
xvpickve.d VI1, VI0, 0
|
||||
xvpickve.d VI2, VI0, 1
|
||||
xvpickve.d VI3, VI0, 2
|
||||
xvpickve.d VI4, VI0, 3
|
||||
xvpickve.d x1, VM0, 0
|
||||
xvpickve.d x2, VM0, 1
|
||||
xvpickve.d x3, VM0, 2
|
||||
xvpickve.d x4, VM0, 3
|
||||
xvfcmp.clt.d VT0, x1, x2
|
||||
#else
|
||||
xvxor.v VX0, VX0, VX0
|
||||
xvor.v VX0, VI0, VX0
|
||||
xvxor.v VX1, VX1, VX1
|
||||
xvor.v VX1, VM0, VX1
|
||||
xvpickve.w VI1, VI0, 0
|
||||
xvpickve.w VI2, VI0, 1
|
||||
xvpickve.w VI3, VI0, 2
|
||||
xvpickve.w VI4, VI0, 3
|
||||
xvpickve.w x1, VM0, 0
|
||||
xvpickve.w x2, VM0, 1
|
||||
xvpickve.w x3, VM0, 2
|
||||
xvpickve.w x4, VM0, 3
|
||||
xvfcmp.clt.s VT0, x1, x2
|
||||
#endif
|
||||
xvbitsel.v VM1, x1, x2, VT0
|
||||
xvbitsel.v VINC4, VI1, VI2, VT0
|
||||
XVCMPLT VT0, x3, x4
|
||||
xvbitsel.v VM0, x3, x4, VT0
|
||||
xvbitsel.v VINC8, VI3, VI4, VT0
|
||||
XVCMPLT VT0, VM0, VM1
|
||||
xvbitsel.v VM0, VM0, VM1, VT0
|
||||
xvbitsel.v VI0, VINC8, VINC4, VT0
|
||||
fcmp.ceq.d $fcc0, $f15, $f9
|
||||
bceqz $fcc0, .L26
|
||||
XVCMPLT VT0, VI1, VI0
|
||||
xvbitsel.v VI0, VI0, VI1, VT0
|
||||
b .L26
|
||||
.align 3
|
||||
|
||||
.L20: // INCX!=1
|
||||
move TEMP, X
|
||||
#ifdef DOUBLE
|
||||
addi.d i0, i0, 1
|
||||
ld.d t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.d VM0, t1, 0
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.d t2, TEMP, 0 * SIZE
|
||||
#else
|
||||
addi.w i0, i0, 1
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L21
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
xvinsgr2vr.w VM0, t1, 0
|
||||
xvinsgr2vr.w VM0, t2, 1
|
||||
xvinsgr2vr.w VM0, t3, 2
|
||||
xvinsgr2vr.w VM0, t4, 3
|
||||
ld.w t1, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.w t2, TEMP, 0 * SIZE
|
||||
#endif
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.d t3, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
ld.d t4, TEMP, 0 * SIZE
|
||||
add.d TEMP, TEMP, INCX
|
||||
#ifdef DOUBLE
|
||||
xvinsgr2vr.d VM0, t1, 0
|
||||
xvinsgr2vr.d VM0, t2, 1
|
||||
xvinsgr2vr.d VM0, t3, 2
|
||||
xvinsgr2vr.d VM0, t4, 3
|
||||
slli.d i0, i0, 2 //4
|
||||
xvreplgr2vr.d VINC4, i0
|
||||
slli.d i0, i0, 1 //8
|
||||
xvreplgr2vr.d VINC8, i0
|
||||
addi.d i0, i0, -15
|
||||
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI1, i0, 3
|
||||
addi.d i0, i0, 5
|
||||
xvinsgr2vr.d VI0, i0, 0 //1
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 1 //2
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 2 //3
|
||||
addi.d i0, i0, 1
|
||||
xvinsgr2vr.d VI0, i0, 3 //4
|
||||
#else
|
||||
xvinsgr2vr.w VM0, t1, 4
|
||||
xvinsgr2vr.w VM0, t2, 5
|
||||
xvinsgr2vr.w VM0, t3, 6
|
||||
xvinsgr2vr.w VM0, t4, 7
|
||||
slli.w i0, i0, 3 //8
|
||||
xvreplgr2vr.w VINC8, i0
|
||||
addi.w i0, i0, -15
|
||||
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI1, i0, 7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 0 //1
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 1 //2
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 2 //3
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 3 //4
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 4 //5
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 5 //6
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 6 //7
|
||||
addi.w i0, i0, 1
|
||||
xvinsgr2vr.w VI0, i0, 7 //8
|
||||
#endif
|
||||
.align 3
|
||||
|
||||
.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI1, VI1, VINC8
xvadd.d VI2, VI1, VINC4
xvfcmp.clt.d VT0, VX0, VX1
addi.d I, I, -1
xvbitsel.v VM1, VX0, VX1, VT0
xvbitsel.v VI2, VI1, VI2, VT0
xvfcmp.clt.d VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VI0, VI2, VT0
#else
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
xvfcmp.clt.s VT0, VM0, VX0
addi.d I, I, -1
xvbitsel.v VM0, VM0, VX0, VT0
xvbitsel.v VI0, VI0, VI1, VT0
#endif
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
#endif
XVCMPLT VT0, x1, x2
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
XVCMPLT VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
XVCMPLT VT0, VM0, VM1
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
#ifdef DOUBLE
movfr2gr.d i0, $f20
#else
fmov.s $f16, $f20
#endif
.align 3

#ifndef DOUBLE
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
xvfcmp.clt.s VT0, x1, x2
xvbitsel.v x1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
xvfcmp.clt.s VT0, x3, x4
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
xvfcmp.clt.s VT0, VM0, x1
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //if values are equal, take the smallest index
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L262
xvfcmp.clt.s VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L262:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L272
xvfcmp.clt.s VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L272:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L282
xvfcmp.clt.s VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L292
xvfcmp.clt.s VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L292:
fcmp.clt.s $fcc0, $f15, $f13
fsel $f15, $f15, $f13, $fcc0
fsel $f20, $f20, $f16, $fcc0
movfr2gr.s i0, $f20
#endif

.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L22:
fld.d $f9, X, 0
addi.d I, I, -1
CMPLT $fcc0, $f15, $f9
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3

.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
@@ -0,0 +1,428 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23

PROLOGUE
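// Note: this LSX variant appears to track the largest element seen so far:
// VM0 carries the candidate values, VI0 the matching 1-based indices, and
// the winning index is returned in $r4 at .L999.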
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
vld VM0, X, 0
#ifdef DOUBLE
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d $vr17, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d $vr18, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w $vr17, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w $vr18, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L10:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vadd.d VI1, VI1, $vr18
vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, $vr17
vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX0, VX1
addi.d I, I, -1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
VCMPLT VT0, x1, x3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
VCMPLT VT0, VM0, x1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
#else
vadd.w VI1, VI1, $vr18
vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, $vr17
VCMPLT VT0, VX0, VX1
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
VCMPLT VT0, VM0, VM1
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
#endif
blt $r0, I, .L10
.align 3

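// .L15: fold the per-lane winners in VM0 down to one scalar result,
// preferring the smallest index when several lanes hold an equal value.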
.L15:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
VCMPLT VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
VCMPLT VT0, x1, x2
vbitsel.v VM1, x1, x2, VT0
vbitsel.v $vr17, VI1, VI2, VT0
VCMPLT VT0, x3, x4
vbitsel.v VM0, x3, x4, VT0
vbitsel.v $vr18, VI3, VI4, VT0
VCMPLT VT0, VM0, VM1
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, $vr18, $vr17, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
VCMPLT VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
#endif
.align 3

.L20: // INCX!=1
move TEMP, X
#ifdef DOUBLE
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2
vreplgr2vr.d $vr17, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d $vr18, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
slli.w i0, i0, 2 //4
vreplgr2vr.w $vr17, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w $vr18, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI1, $vr18
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX0, VX1
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI2, $vr17
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX0, VX1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
VCMPLT VT0, x1, x3
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
VCMPLT VT0, VM0, x1
addi.d I, I, -1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
#else
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vadd.w VI1, VI1, $vr18
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, $vr17
VCMPLT VT0, VX0, VX1
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
VCMPLT VT0, VM0, VM1
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
#endif
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
VCMPLT VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
vfcmp.clt.s VT0, x1, x2
vbitsel.v VM1, x1, x2, VT0
vbitsel.v $vr17, VI1, VI2, VT0
vfcmp.clt.s VT0, x3, x4
vbitsel.v VM0, x3, x4, VT0
vbitsel.v $vr18, VI3, VI4, VT0
vfcmp.clt.s VT0, VM0, VM1
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, $vr18, $vr17, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
vfcmp.clt.s VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
#endif
.align 3

.L26:
#ifdef DOUBLE
VCMPLT VT0, x1, x2
vbitsel.v VM0, x1, x2, VT0
vbitsel.v VI0, VI1, VI2, VT0
#else
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
VCMPLT VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
#endif
.align 3

.L27:
#ifndef DOUBLE
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
VCMPLT VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
VCMPLT VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
#endif
MTG i0, $f20
.align 3

.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L22:
fld.d $f9, X, 0
addi.d I, I, -1
CMPLT $fcc0, $f15, $f9
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3

.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
@@ -0,0 +1,534 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $xr9
#define x2 $xr10
#define x3 $xr11
#define x4 $xr12
#define VX0 $xr13
#define VX1 $xr14
#define VM0 $xr15
#define VM1 $xr16
#define VINC4 $xr17
#define VINC8 $xr18
#define VI0 $xr20
#define VI1 $xr21
#define VI2 $xr22
#define VI3 $xr8
#define VI4 $xr19
#define VT0 $xr23

PROLOGUE
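// Note: this LASX kernel appears to track the smallest element (the compares
// select the lesser operand); VM0 holds the candidate values, VI0 their
// indices, and $r4 receives the winning 1-based index at .L999.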
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
xvld VM0, X, 0
#ifdef DOUBLE
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
#else
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
#endif
.align 3

.L10:
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvadd.d VI1, VI1, VINC8
xvld VX1, X, 4 * SIZE
xvadd.d VI2, VI1, VINC4
XVCMPLT VT0, VX1, VX0
addi.d I, I, -1
xvbitsel.v VM1, VX0, VX1, VT0
xvbitsel.v VI2, VI1, VI2, VT0
XVCMPLT VT0, VM1, VM0
addi.d X, X, 8 * SIZE
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VI0, VI2, VT0
#else
xvadd.w VI1, VI1, VINC8
XVCMPLT VT0, VX0, VM0
addi.d I, I, -1
xvbitsel.v VM0, VM0, VX0, VT0
xvbitsel.v VI0, VI0, VI1, VT0
addi.d X, X, 8 * SIZE
#endif
blt $r0, I, .L10
.align 3

.L15:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
#endif
XVCMPLT VT0, x2, x1
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
XVCMPLT VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
XVCMPLT VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
b .L26
.align 3

.L20: // INCX!=1
move TEMP, X
#ifdef DOUBLE
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.d t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.d VM0, t1, 0
xvinsgr2vr.d VM0, t2, 1
xvinsgr2vr.d VM0, t3, 2
xvinsgr2vr.d VM0, t4, 3
slli.d i0, i0, 2 //4
xvreplgr2vr.d VINC4, i0
slli.d i0, i0, 1 //8
xvreplgr2vr.d VINC8, i0
addi.d i0, i0, -15
xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 2
addi.d i0, i0, 1
xvinsgr2vr.d VI1, i0, 3
addi.d i0, i0, 5
xvinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 1 //2
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 2 //3
addi.d i0, i0, 1
xvinsgr2vr.d VI0, i0, 3 //4
#else
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 0
xvinsgr2vr.w VM0, t2, 1
xvinsgr2vr.w VM0, t3, 2
xvinsgr2vr.w VM0, t4, 3
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
xvinsgr2vr.w VM0, t1, 4
xvinsgr2vr.w VM0, t2, 5
xvinsgr2vr.w VM0, t3, 6
xvinsgr2vr.w VM0, t4, 7
slli.w i0, i0, 3 //8
xvreplgr2vr.w VINC8, i0
addi.w i0, i0, -15
xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 4
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 5
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 6
addi.w i0, i0, 1
xvinsgr2vr.w VI1, i0, 7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 3 //4
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 4 //5
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 5 //6
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 6 //7
addi.w i0, i0, 1
xvinsgr2vr.w VI0, i0, 7 //8
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
xvadd.d VI1, VI1, VINC8
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvadd.d VI1, VI1, VINC8
xvadd.d VI2, VI1, VINC4
XVCMPLT VT0, VX1, VX0
addi.d I, I, -1
xvbitsel.v VM1, VX0, VX1, VT0
xvbitsel.v VI2, VI1, VI2, VT0
XVCMPLT VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VI0, VI2, VT0
#else
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 0
xvinsgr2vr.w VX0, t2, 1
xvinsgr2vr.w VX0, t3, 2
xvinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.w VX0, t1, 4
xvinsgr2vr.w VX0, t2, 5
xvinsgr2vr.w VX0, t3, 6
xvinsgr2vr.w VX0, t4, 7
xvadd.w VI1, VI1, VINC8
XVCMPLT VT0, VX0, VM0
addi.d I, I, -1
xvbitsel.v VM0, VM0, VX0, VT0
xvbitsel.v VI0, VI0, VI1, VT0
#endif
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
xvpickve.d VI1, VI0, 0
xvpickve.d VI2, VI0, 1
xvpickve.d VI3, VI0, 2
xvpickve.d VI4, VI0, 3
xvpickve.d x1, VM0, 0
xvpickve.d x2, VM0, 1
xvpickve.d x3, VM0, 2
xvpickve.d x4, VM0, 3
#else
xvxor.v VX0, VX0, VX0
xvor.v VX0, VI0, VX0
xvxor.v VX1, VX1, VX1
xvor.v VX1, VM0, VX1
xvpickve.w VI1, VI0, 0
xvpickve.w VI2, VI0, 1
xvpickve.w VI3, VI0, 2
xvpickve.w VI4, VI0, 3
xvpickve.w x1, VM0, 0
xvpickve.w x2, VM0, 1
xvpickve.w x3, VM0, 2
xvpickve.w x4, VM0, 3
#endif
XVCMPLT VT0, x2, x1
xvbitsel.v VM1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
XVCMPLT VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
XVCMPLT VT0, VM1, VM0
xvbitsel.v VM0, VM0, VM1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L26:
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L27:
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
#ifdef DOUBLE
MTG i0, $f20
#else
fmov.s $f16, $f20
#endif
.align 3

#ifndef DOUBLE
.L252:
xvxor.v VI0, VI0, VI0
xvor.v VI0, VI0, VX0
fmov.s $f13, $f15
xvxor.v VM0, VM0, VM0
xvor.v VM0, VM0, VX1
xvpickve.w VI1, VI0, 4
xvpickve.w VI2, VI0, 5
xvpickve.w VI3, VI0, 6
xvpickve.w VI4, VI0, 7
xvpickve.w x1, VM0, 4
xvpickve.w x2, VM0, 5
xvpickve.w x3, VM0, 6
xvpickve.w x4, VM0, 7
XVCMPLT VT0, x2, x1
xvbitsel.v x1, x1, x2, VT0
xvbitsel.v VINC4, VI1, VI2, VT0
XVCMPLT VT0, x4, x3
xvbitsel.v VM0, x3, x4, VT0
xvbitsel.v VINC8, VI3, VI4, VT0
XVCMPLT VT0, x1, VM0
xvbitsel.v VM0, VM0, x1, VT0
xvbitsel.v VI0, VINC8, VINC4, VT0
li.d TEMP, 1 //if values are equal, take the smallest index
movgr2fr.w $f17, TEMP
ffint.s.w $f17, $f17
xvfcmp.ceq.s VT0, VM0, x1
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L262
XVCMPLT VT0, VI1, VI0
xvbitsel.v VI0, VI0, VI1, VT0
.align 3

.L262:
xvfcmp.ceq.s VT0, VM0, x2
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L272
XVCMPLT VT0, VI2, VI0
xvbitsel.v VI0, VI0, VI2, VT0
.align 3

.L272:
xvfcmp.ceq.s VT0, VM0, x3
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L282
XVCMPLT VT0, VI3, VI0
xvbitsel.v VI0, VI0, VI3, VT0
.align 3

.L282:
xvfcmp.ceq.s VT0, VM0, x4
fcmp.ceq.s $fcc0, $f23, $f17
bceqz $fcc0, .L292
XVCMPLT VT0, VI4, VI0
xvbitsel.v VI0, VI0, VI4, VT0
.align 3

.L292:
CMPLT $fcc0, $f13, $f15
fsel $f15, $f15, $f13, $fcc0
fsel $f20, $f20, $f16, $fcc0
MTG i0, $f20
#endif

.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L22:
fld.d $f9, X, 0
addi.d I, I, -1
CMPLT $fcc0, $f9, $f15
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3

.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
@@ -0,0 +1,428 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6
#define I $r12
#define t1 $r13
#define t2 $r15
#define t3 $r18
#define t4 $r16
#define i0 $r17
#define i1 $r14
#define TEMP $r19
#define x1 $vr9
#define x2 $vr10
#define x3 $vr11
#define x4 $vr12
#define VX0 $vr13
#define VX1 $vr14
#define VM0 $vr15
#define VM1 $vr16
#define VI0 $vr20
#define VI1 $vr21
#define VI2 $vr22
#define VI3 $vr8
#define VI4 $vr19
#define VT0 $vr23

PROLOGUE
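// Note: LSX counterpart of the kernel above; same running-minimum scheme on
// 128-bit vectors (VM0 = candidate values, VI0 = their indices).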
li.d i0, 0
bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20
vld VM0, X, 0
#ifdef DOUBLE
addi.d i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.d i0, i0, 1 //2
vreplgr2vr.d $vr17, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d $vr18, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
addi.w i0, i0, 1
srai.d I, N, 3
bge $r0, I, .L21
slli.w i0, i0, 2 //4
vreplgr2vr.w $vr17, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w $vr18, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L10:
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vadd.d VI1, VI1, $vr18
vld VX1, X, 2 * SIZE
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX1, VX0
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
vld VX0, X, 4 * SIZE
vadd.d VI1, VI2, $vr17
vld VX1, X, 6 * SIZE
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX1, VX0
addi.d I, I, -1
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
VCMPLT VT0, x3, x1
addi.d X, X, 8 * SIZE
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
VCMPLT VT0, x1, VM0
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
#else
vadd.w VI1, VI1, $vr18
vld VX1, X, 4 * SIZE
vadd.w VI2, VI1, $vr17
VCMPLT VT0, VX1, VX0
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
VCMPLT VT0, VM1, VM0
addi.d X, X, 8 * SIZE
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
#endif
blt $r0, I, .L10
.align 3

.L15:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
VCMPLT VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
VCMPLT VT0, x2, x1
vbitsel.v VM1, x1, x2, VT0
vbitsel.v $vr17, VI1, VI2, VT0
VCMPLT VT0, x4, x3
vbitsel.v VM0, x3, x4, VT0
vbitsel.v $vr18, VI3, VI4, VT0
VCMPLT VT0, VM1, VM0
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, $vr18, $vr17, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
VCMPLT VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
b .L26
#endif
.align 3

.L20: // INCX!=1
move TEMP, X
#ifdef DOUBLE
addi.d i0, i0, 1
ld.d t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.d t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.d VM0, t2, 1
slli.d i0, i0, 1 //2
vreplgr2vr.d $vr17, i0
slli.d i0, i0, 1 //4
vreplgr2vr.d $vr18, i0
addi.d i0, i0, -7
vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
addi.d i0, i0, 1
vinsgr2vr.d VI1, i0, 1
addi.d i0, i0, 3
vinsgr2vr.d VI0, i0, 0 //1
addi.d i0, i0, 1
vinsgr2vr.d VI0, i0, 1 //2
#else
addi.w i0, i0, 1
ld.w t1, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t1, 0
srai.d I, N, 3
bge $r0, I, .L21
ld.w t2, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t3, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
ld.w t4, TEMP, 0 * SIZE
add.d TEMP, TEMP, INCX
vinsgr2vr.w VM0, t2, 1
vinsgr2vr.w VM0, t3, 2
vinsgr2vr.w VM0, t4, 3
slli.w i0, i0, 2 //4
vreplgr2vr.w $vr17, i0
slli.w i0, i0, 1 //8
vreplgr2vr.w $vr18, i0
addi.w i0, i0, -15
vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 1
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 2
addi.w i0, i0, 1
vinsgr2vr.w VI1, i0, 3
addi.w i0, i0, 5
vinsgr2vr.w VI0, i0, 0 //1
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 1 //2
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 2 //3
addi.w i0, i0, 1
vinsgr2vr.w VI0, i0, 3 //4
#endif
.align 3

.L24:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI1, $vr18
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX1, VX0
vbitsel.v x1, VX0, VX1, VT0
vbitsel.v x2, VI1, VI2, VT0
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
vadd.d VI1, VI2, $vr17
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t1, 0
vinsgr2vr.d VX1, t2, 1
vadd.d VI2, VI1, $vr17
VCMPLT VT0, VX1, VX0
vbitsel.v x3, VX0, VX1, VT0
vbitsel.v x4, VI1, VI2, VT0
VCMPLT VT0, x3, x1
vbitsel.v x1, x1, x3, VT0
vbitsel.v x2, x2, x4, VT0
VCMPLT VT0, x1, VM0
addi.d I, I, -1
vbitsel.v VM0, VM0, x1, VT0
vbitsel.v VI0, VI0, x2, VT0
#else
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
vadd.w VI1, VI1, $vr18
ld.w t1, X, 0 * SIZE
add.d X, X, INCX
ld.w t2, X, 0 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
add.d X, X, INCX
ld.w t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vadd.w VI2, VI1, $vr17
VCMPLT VT0, VX1, VX0
addi.d I, I, -1
vbitsel.v VM1, VX0, VX1, VT0
vbitsel.v VI2, VI1, VI2, VT0
VCMPLT VT0, VM1, VM0
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, VI0, VI2, VT0
#endif
blt $r0, I, .L24
.align 3

.L25:
#ifdef DOUBLE
vreplvei.d VI1, VI0, 0
vreplvei.d VI2, VI0, 1
vreplvei.d x1, VM0, 0
vreplvei.d x2, VM0, 1
fcmp.ceq.d $fcc0, $f10, $f9
bceqz $fcc0, .L26
VCMPLT VT0, VI1, VI2
vbitsel.v VI0, VI2, VI1, VT0
b .L27
#else
vreplvei.w VI1, VI0, 0
vreplvei.w VI2, VI0, 1
vreplvei.w VI3, VI0, 2
vreplvei.w VI4, VI0, 3
vreplvei.w x1, VM0, 0
vreplvei.w x2, VM0, 1
vreplvei.w x3, VM0, 2
vreplvei.w x4, VM0, 3
VCMPLT VT0, x2, x1
vbitsel.v VM1, x1, x2, VT0
vbitsel.v $vr17, VI1, VI2, VT0
VCMPLT VT0, x4, x3
vbitsel.v VM0, x3, x4, VT0
vbitsel.v $vr18, VI3, VI4, VT0
VCMPLT VT0, VM1, VM0
vbitsel.v VM0, VM0, VM1, VT0
vbitsel.v VI0, $vr18, $vr17, VT0
fcmp.ceq.d $fcc0, $f15, $f9
bceqz $fcc0, .L26
VCMPLT VT0, VI1, VI0
vbitsel.v VI0, VI0, VI1, VT0
#endif
.align 3

.L26:
#ifdef DOUBLE
VCMPLT VT0, x2, x1
vbitsel.v VM0, x1, x2, VT0
vbitsel.v VI0, VI1, VI2, VT0
#else
fcmp.ceq.d $fcc0, $f15, $f10
bceqz $fcc0, .L27
VCMPLT VT0, VI2, VI0
vbitsel.v VI0, VI0, VI2, VT0
#endif
.align 3

.L27:
#ifndef DOUBLE
fcmp.ceq.d $fcc0, $f15, $f11
bceqz $fcc0, .L28
VCMPLT VT0, VI3, VI0
vbitsel.v VI0, VI0, VI3, VT0
.align 3

.L28:
fcmp.ceq.d $fcc0, $f15, $f12
bceqz $fcc0, .L29
VCMPLT VT0, VI4, VI0
vbitsel.v VI0, VI0, VI4, VT0
.align 3

.L29:
#endif
MTG i0, $f20
.align 3

.L21: //N<8
andi I, N, 7
bge $r0, I, .L999
srai.d i1, N, 3
slli.d i1, i1, 3
addi.d i1, i1, 1 //current index
movgr2fr.d $f21, i1
movgr2fr.d $f20, i0
.align 3

.L22:
fld.d $f9, X, 0
addi.d I, I, -1
CMPLT $fcc0, $f9, $f15
add.d X, X, INCX
fsel $f15, $f15, $f9, $fcc0
fsel $f20, $f20, $f21, $fcc0
addi.d i1, i1, 1
movgr2fr.d $f21, i1
blt $r0, I, .L22
MTG i0, $f20
.align 3

.L999:
move $r4, $r17
jirl $r0, $r1, 0x0
.align 3

EPILOGUE
@@ -0,0 +1,229 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6

#define I $r12
#define TEMP $r13

#define VM0 $xr0
#define VM1 $xr1
#define VM2 $xr2
#define VX0 $xr3
#define VX1 $xr4
#define VX2 $xr5
#define VX3 $xr6

#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r17

PROLOGUE

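// Note: this kernel reduces with FMAX only, so it returns the maximum value
// itself in $f0 rather than an index.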
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
xvldrepl.d VM0, X, 0
#else
xvldrepl.w VM0, X, 0
#endif
bne INCX, TEMP, .L20

srai.d I, N, 4
bge $r0, I, .L11
.align 3

.L10:
#ifdef DOUBLE
xvld VX0, X, 0
xvld VX1, X, 32
xvld VX2, X, 64
xvld VX3, X, 96
addi.d I, I, -1
addi.d X, X, 128
XVFMAX VM1, VX0, VX1
XVFMAX VM2, VX2, VX3
XVFMAX VM0, VM0, VM1
XVFMAX VM0, VM0, VM2
#else
xvld VX0, X, 0
xvld VX1, X, 32
addi.d I, I, -1
addi.d X, X, 64
XVFMAX VM1, VX0, VX1
XVFMAX VM0, VM0, VM1
#endif
blt $r0, I, .L10

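// Horizontal reduction: xvrepl128vei broadcasts each in-lane element so
// XVFMAX can fold them, then xvpermi.q folds the high 128-bit half into the
// low half for the final XVFMAX.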
#ifdef DOUBLE
xvrepl128vei.d VX0, VM0, 0
xvrepl128vei.d VX1, VM0, 1
XVFMAX VM0, VX0, VX1
#else
xvrepl128vei.w VX0, VM0, 0
xvrepl128vei.w VX1, VM0, 1
xvrepl128vei.w VX2, VM0, 2
xvrepl128vei.w VX3, VM0, 3
XVFMAX VM1, VX0, VX1
XVFMAX VM2, VX2, VX3
XVFMAX VM0, VM1, VM2
#endif
xvpermi.q VM1, VM0, 0x1
XVFMAX VM0, VM0, VM1
.align 3

.L11:
andi I, N, 0x0f
bge $r0, I, .L13
.align 3

.L12: /* 0 < N < 16 */
LD $f1, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
FMAX $f0, $f0, $f1
bnez I, .L12
.align 3

.L13:
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
srai.d I, N, 3
bge $r0, I, .L23
.align 3

.L21:
#ifdef DOUBLE
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0
add.d X, X, INCX
ld.d t4, X, 0
add.d X, X, INCX
xvinsgr2vr.d VX0, t1, 0
xvinsgr2vr.d VX0, t2, 1
xvinsgr2vr.d VX0, t3, 2
xvinsgr2vr.d VX0, t4, 3
ld.d t1, X, 0
add.d X, X, INCX
ld.d t2, X, 0
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
xvinsgr2vr.d VX1, t1, 0
xvinsgr2vr.d VX1, t2, 1
xvinsgr2vr.d VX1, t3, 2
xvinsgr2vr.d VX1, t4, 3
xvfmaxa.d VM1, VX0, VX1
xvfmaxa.d VM0, VM0, VM1
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VM1, t1, 0
xvinsgr2vr.w VM1, t2, 1
xvinsgr2vr.w VM1, t3, 2
xvinsgr2vr.w VM1, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
xvinsgr2vr.w VM1, t1, 4
xvinsgr2vr.w VM1, t2, 5
xvinsgr2vr.w VM1, t3, 6
xvinsgr2vr.w VM1, t4, 7
xvfmaxa.s VM0, VM0, VM1
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
#ifdef DOUBLE
xvrepl128vei.d VX0, VM0, 0
xvrepl128vei.d VX1, VM0, 1
XVFMAX VM0, VX0, VX1
#else
xvrepl128vei.w VX0, VM0, 0
xvrepl128vei.w VX1, VM0, 1
xvrepl128vei.w VX2, VM0, 2
xvrepl128vei.w VX3, VM0, 3
XVFMAX VM1, VX0, VX1
XVFMAX VM2, VX2, VX3
XVFMAX VM0, VM1, VM2
#endif
xvpermi.q VM1, VM0, 1
XVFMAX VM0, VM0, VM1
.align 3

.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3

.L24: /* 0 < N < 8 */
LD $f1, X, 0
addi.d I, I, -1
add.d X, X, INCX
FMAX $f0, $f0, $f1
bnez I, .L24
.align 3

.L999:
jirl $r0, $r1, 0x0

EPILOGUE
@@ -0,0 +1,228 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define ASSEMBLER

#include "common.h"

#define N $r4
#define X $r5
#define INCX $r6

#define I $r12
#define TEMP $r13

#define VM0 $vr0
#define VM1 $vr1
#define VM2 $vr2
#define VX0 $vr3
#define VX1 $vr4
#define VX2 $vr5
#define VX3 $vr6

#define t1 $r14
#define t2 $r15
#define t3 $r16
#define t4 $r17

PROLOGUE

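// Note: LSX counterpart of the LASX max kernel above; the same FMAX-style
// reduction on 128-bit vectors, with the scalar maximum returned in $f0.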
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

bge $r0, N, .L999
bge $r0, INCX, .L999
li.d TEMP, 1
slli.d TEMP, TEMP, BASE_SHIFT
slli.d INCX, INCX, BASE_SHIFT
#ifdef DOUBLE
vldrepl.d VM0, X, 0
#else
vldrepl.w VM0, X, 0
#endif
bne INCX, TEMP, .L20

srai.d I, N, 3
bge $r0, I, .L11
.align 3

.L10:
#ifdef DOUBLE
vld VX0, X, 0
vld VX1, X, 16
vld VX2, X, 32
vld VX3, X, 48
addi.d I, I, -1
addi.d X, X, 64
VFMAX VM1, VX0, VX1
VFMAX VM2, VX2, VX3
VFMAX VM0, VM0, VM1
VFMAX VM0, VM0, VM2
#else
vld VX0, X, 0
vld VX1, X, 16
addi.d I, I, -1
addi.d X, X, 32
VFMAX VM1, VX0, VX1
VFMAX VM0, VM0, VM1
#endif
blt $r0, I, .L10

#ifdef DOUBLE
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
VFMAX VM0, VX0, VX1
#else
vreplvei.w VX0, VM0, 0
vreplvei.w VX1, VM0, 1
vreplvei.w VX2, VM0, 2
vreplvei.w VX3, VM0, 3
VFMAX VM1, VX0, VX1
VFMAX VM2, VX2, VX3
VFMAX VM0, VM1, VM2
#endif
.align 3

.L11:
andi I, N, 7
bge $r0, I, .L13
.align 3

.L12:
LD $f1, X, 0
addi.d I, I, -1
addi.d X, X, SIZE
FMAX $f0, $f0, $f1
bnez I, .L12
.align 3

.L13:
jirl $r0, $r1, 0x0
.align 3

.L20: // INCX!=1
srai.d I, N, 3
bge $r0, I, .L23
.align 3

.L21:
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM1, VX0, VX1
ld.d t1, X, 0 * SIZE
add.d X, X, INCX
ld.d t2, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX0, t1, 0
vinsgr2vr.d VX0, t2, 1
ld.d t3, X, 0 * SIZE
add.d X, X, INCX
ld.d t4, X, 0 * SIZE
add.d X, X, INCX
vinsgr2vr.d VX1, t3, 0
vinsgr2vr.d VX1, t4, 1
vfmaxa.d VM2, VX0, VX1
vfmaxa.d VM1, VM1, VM2
vfmaxa.d VM0, VM0, VM1
#else
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX0, t1, 0
vinsgr2vr.w VX0, t2, 1
vinsgr2vr.w VX0, t3, 2
vinsgr2vr.w VX0, t4, 3
ld.w t1, X, 0
add.d X, X, INCX
ld.w t2, X, 0
add.d X, X, INCX
ld.w t3, X, 0
add.d X, X, INCX
ld.w t4, X, 0
add.d X, X, INCX
vinsgr2vr.w VX1, t1, 0
vinsgr2vr.w VX1, t2, 1
vinsgr2vr.w VX1, t3, 2
vinsgr2vr.w VX1, t4, 3
vfmaxa.s VM1, VX0, VX1
vfmaxa.s VM0, VM0, VM1
#endif
addi.d I, I, -1
blt $r0, I, .L21
.align 3

.L22:
#ifdef DOUBLE
vreplvei.d VX0, VM0, 0
vreplvei.d VX1, VM0, 1
VFMAX VM0, VX0, VX1
#else
vreplvei.w VX0, VM0, 0
vreplvei.w VX1, VM0, 1
vreplvei.w VX2, VM0, 2
vreplvei.w VX3, VM0, 3
VFMAX VM1, VX0, VX1
VFMAX VM2, VX2, VX3
VFMAX VM0, VM1, VM2
#endif
.align 3

.L23: //INCX!=1 and N<8
andi I, N, 7
bge $r0, I, .L999
.align 3

.L24:
LD $f1, X, 0
addi.d I, I, -1
add.d X, X, INCX
FMAX $f0, $f0, $f1
bnez I, .L24
.align 3

.L999:
jirl $r0, $r1, 0x0

EPILOGUE