Merge pull request #3394 from xianyi/develop
Merge from develop for 0.3.18
This commit is contained in:
commit
c75759876c
329
.travis.yml
329
.travis.yml
|
|
@ -1,33 +1,38 @@
|
|||
# XXX: Precise is already deprecated, new default is Trusty.
|
||||
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming
|
||||
dist: precise
|
||||
dist: focal
|
||||
sudo: true
|
||||
language: c
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- &test-ubuntu
|
||||
os: linux
|
||||
# os: linux
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- gfortran
|
||||
# before_script: &common-before
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
# script:
|
||||
# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# - make -C test $COMMON_FLAGS $BTYPE
|
||||
# - make -C ctest $COMMON_FLAGS $BTYPE
|
||||
# - make -C utest $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
before_script: &common-before
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
script:
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX
|
||||
|
|
@ -55,38 +60,38 @@ matrix:
|
|||
- TARGET_BOX=IBMZ_LINUX
|
||||
- BTYPE="BINARY=64 USE_OPENMP=0 CC=clang"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
compiler: clang
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 CC=clang"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
compiler: clang
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 INTERFACE64=1 CC=clang"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- gcc-multilib
|
||||
- gfortran-multilib
|
||||
env:
|
||||
- TARGET_BOX=LINUX32
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
# - <<: *test-ubuntu
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 INTERFACE64=1"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# compiler: clang
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 CC=clang"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# compiler: clang
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# addons:
|
||||
# apt:
|
||||
# packages:
|
||||
# - gcc-multilib
|
||||
# - gfortran-multilib
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX32
|
||||
# - BTYPE="BINARY=32"
|
||||
#
|
||||
- os: linux
|
||||
arch: ppc64le
|
||||
dist: bionic
|
||||
|
|
@ -121,47 +126,47 @@ matrix:
|
|||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX_P9
|
||||
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- binutils-mingw-w64-x86-64
|
||||
- gcc-mingw-w64-x86-64
|
||||
- gfortran-mingw-w64-x86-64
|
||||
before_script: *common-before
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- TARGET_BOX=WIN64
|
||||
- BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
|
||||
|
||||
# - os: linux
|
||||
# compiler: gcc
|
||||
# addons:
|
||||
# apt:
|
||||
# packages:
|
||||
# - binutils-mingw-w64-x86-64
|
||||
# - gcc-mingw-w64-x86-64
|
||||
# - gfortran-mingw-w64-x86-64
|
||||
# before_script: *common-before
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - TARGET_BOX=WIN64
|
||||
# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
|
||||
#
|
||||
# Build & test on Alpine Linux inside chroot, i.e. on system with musl libc.
|
||||
# These jobs need sudo, so Travis runs them on VM-based infrastructure
|
||||
# which is slower than container-based infrastructure used for jobs
|
||||
# that don't require sudo.
|
||||
- &test-alpine
|
||||
os: linux
|
||||
dist: trusty
|
||||
sudo: true
|
||||
language: minimal
|
||||
before_install:
|
||||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
install:
|
||||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||
before_script: *common-before
|
||||
script:
|
||||
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
||||
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
||||
- alpine make -C test $COMMON_FLAGS $BTYPE
|
||||
- alpine make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- alpine make -C utest $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- TARGET_BOX=LINUX64_MUSL
|
||||
- BTYPE="BINARY=64"
|
||||
# - &test-alpine
|
||||
# os: linux
|
||||
# dist: trusty
|
||||
# sudo: true
|
||||
# language: minimal
|
||||
# before_install:
|
||||
# - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
# && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
# - alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
# install:
|
||||
# - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||
# before_script: *common-before
|
||||
# script:
|
||||
# # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
||||
# - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
||||
# - alpine make -C test $COMMON_FLAGS $BTYPE
|
||||
# - alpine make -C ctest $COMMON_FLAGS $BTYPE
|
||||
# - alpine make -C utest $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64"
|
||||
|
||||
# XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS,
|
||||
# but only on Travis CI, cannot reproduce it elsewhere.
|
||||
|
|
@ -171,98 +176,98 @@ matrix:
|
|||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
|
||||
- <<: *test-alpine
|
||||
env:
|
||||
- TARGET_BOX=LINUX64_MUSL
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
# - <<: *test-alpine
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64 INTERFACE64=1"
|
||||
#
|
||||
# # Build with the same flags as Alpine does in its OpenBLAS package.
|
||||
# - <<: *test-alpine
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
|
||||
|
||||
# Build with the same flags as Alpine does in its OpenBLAS package.
|
||||
- <<: *test-alpine
|
||||
env:
|
||||
- TARGET_BOX=LINUX64_MUSL
|
||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
|
||||
# - &test-cmake
|
||||
# os: linux
|
||||
# compiler: clang
|
||||
# addons:
|
||||
# apt:
|
||||
# packages:
|
||||
# - gfortran
|
||||
# - cmake
|
||||
# dist: trusty
|
||||
# sudo: true
|
||||
# before_script:
|
||||
# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
|
||||
# script:
|
||||
# - mkdir build
|
||||
# - CONFIG=Release
|
||||
# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
|
||||
# - cmake --build build --config $CONFIG -- -j2
|
||||
# env:
|
||||
# - CMAKE=1
|
||||
# - <<: *test-cmake
|
||||
# env:
|
||||
# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1"
|
||||
# - <<: *test-cmake
|
||||
# compiler: gcc
|
||||
# env:
|
||||
# - CMAKE=1
|
||||
|
||||
- &test-cmake
|
||||
os: linux
|
||||
compiler: clang
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- gfortran
|
||||
- cmake
|
||||
dist: trusty
|
||||
sudo: true
|
||||
before_script:
|
||||
- COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
|
||||
script:
|
||||
- mkdir build
|
||||
- CONFIG=Release
|
||||
- cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
|
||||
- cmake --build build --config $CONFIG -- -j2
|
||||
env:
|
||||
- CMAKE=1
|
||||
- <<: *test-cmake
|
||||
env:
|
||||
- CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1"
|
||||
- <<: *test-cmake
|
||||
compiler: gcc
|
||||
env:
|
||||
- CMAKE=1
|
||||
|
||||
- &test-macos
|
||||
os: osx
|
||||
osx_image: xcode11.5
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode12
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode12
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
# - &test-macos
|
||||
# os: osx
|
||||
# osx_image: xcode11.5
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
|
||||
#
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode12
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# - brew update
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
|
||||
#
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode12
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# - brew update
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode10
|
||||
# env:
|
||||
# - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode11.5
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
env:
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode11.5
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# - brew update
|
||||
# env:
|
||||
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
- <<: *test-macos
|
||||
osx_image: xcode11.5
|
||||
env:
|
||||
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
- BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
|
||||
# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode11.5
|
||||
# env:
|
||||
## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
|
||||
|
||||
- &test-graviton2
|
||||
os: linux
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ endif ()
|
|||
|
||||
if (BUILD_BFLOAT16)
|
||||
message(STATUS "Building Half Precision")
|
||||
list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
|
||||
# list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
|
||||
|
|
|
|||
|
|
@ -1,4 +1,47 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.18
|
||||
02-Oct-2021
|
||||
|
||||
general:
|
||||
- when the build-time number of preconfigured threads is exceeded
|
||||
at runtime (typically by an external program calling BLAS functions
|
||||
from a larger number of threads in parallel), OpenBLAS will now
|
||||
allocate an auxiliary control structure for up to 512 additional
|
||||
threads instead of aborting
|
||||
- added support for Loongson's LoongArch64 cpu architecture
|
||||
- fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON
|
||||
- added support for building OpenBLAS as a CMAKE subproject
|
||||
- added support for building for Windows/ARM64 targets with clang
|
||||
- improved support for building with the IBM xlf compiler
|
||||
- imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV)
|
||||
- imported Reference-LAPACK PR 597 for testsuite compatibility with
|
||||
LLVM's libomp
|
||||
|
||||
x86_64:
|
||||
- added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000)
|
||||
- added optimized SBGEMM for Intel Cooper Lake
|
||||
- reinstated the performance patch for AVX512 SGEMV_T with a proper fix
|
||||
- added a workaround for a gcc11 tree-vectorizer bug that caused spurious
|
||||
failures in the test programs for complex BLAS3 when compiling at -O3
|
||||
(the default for cmake "release" builds)
|
||||
- added support for runtime cpu count detection under Haiku OS
|
||||
- worked around a long-standing miscompilation issue of the Haswell DGEMV_T
|
||||
kernel with gcc that could produce NaN output in some corner cases
|
||||
|
||||
POWER:
|
||||
- improved performance of DASUM on POWER10
|
||||
|
||||
ARMV8:
|
||||
- fixed crashes (use of reserved register x18) on Apple M1 under OSX
|
||||
- fixed building with gcc releases earlier than 5.1
|
||||
|
||||
MIPS:
|
||||
- fixed building under BSD
|
||||
|
||||
MIPS64:
|
||||
- fixed building under BSD
|
||||
|
||||
====================================================================
|
||||
Version 0.3.17
|
||||
15-Jul-2021
|
||||
|
|
|
|||
2
Makefile
2
Makefile
|
|
@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild
|
|||
lapack_prebuild :
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
|
|
|||
|
|
@ -1,4 +1,15 @@
|
|||
ifneq ($(C_COMPILER), PGI)
|
||||
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
|
||||
else
|
||||
|
||||
|
||||
ifeq ($(CORE), ARMV8)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
|
@ -138,4 +149,7 @@ FCOMMON_OPT += -march=armv8-a -mtune=emag
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
endif
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
ifdef BINARY64
|
||||
else
|
||||
endif
|
||||
|
|
@ -12,9 +12,13 @@ endif
|
|||
ifeq ($(CORE), POWER10)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
|
|
@ -33,7 +37,11 @@ else
|
|||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
|
||||
|
|
@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
|||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(OSNAME), AIX)
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.17
|
||||
VERSION = 0.3.17.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
|||
|
|
@ -33,6 +33,10 @@ else ifeq ($(ARCH), armv7)
|
|||
override ARCH=arm
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
override ARCH=arm64
|
||||
else ifeq ($(ARCH), mipsel)
|
||||
override ARCH=mips
|
||||
else ifeq ($(ARCH), mips64el)
|
||||
override ARCH=mips64
|
||||
else ifeq ($(ARCH), zarch)
|
||||
override ARCH=zarch
|
||||
endif
|
||||
|
|
@ -244,6 +248,14 @@ else
|
|||
ONLY_CBLAS = 0
|
||||
endif
|
||||
|
||||
#For small matrix optimization
|
||||
ifeq ($(ARCH), x86_64)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
endif
|
||||
ifeq ($(SMALL_MATRIX_OPT), 1)
|
||||
CCOMMON_OPT += -DSMALL_MATRIX_OPT
|
||||
endif
|
||||
|
||||
# This operation is expensive, so it should be executed only once.
|
||||
ifndef GOTOBLAS_MAKEFILE
|
||||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
|
@ -780,6 +792,11 @@ NO_BINARY_MODE = 1
|
|||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
|
||||
#
|
||||
# C Compiler dependent settings
|
||||
|
|
@ -850,6 +867,13 @@ ifeq ($(OSNAME), AIX)
|
|||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
ifeq ($(CORE), LOONGSON3R5)
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
ifndef BINARY_DEFINED
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
[](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
|
||||
Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS)
|
||||
Travis CI: [](https://travis-ci.com/xianyi/OpenBLAS)
|
||||
|
||||
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
|
|
@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
|
|
@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
|
||||
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
|
||||
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
|
||||
- **Cortex-A55**: same as ARMV8 (different cpu specifications)
|
||||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||
|
|
@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
|
||||
#### RISC-V
|
||||
|
||||
- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
|
||||
- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
(also known to work on C906)
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
|
|
|
|||
|
|
@ -110,3 +110,5 @@ Z14
|
|||
RISCV64_GENERIC
|
||||
C910V
|
||||
|
||||
11.LOONGARCH64:
|
||||
LOONGSON3R5
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ jobs:
|
|||
# of gcc / glibc
|
||||
- job: manylinux1_gcc
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
echo "FROM quay.io/pypa/manylinux1_x86_64
|
||||
|
|
@ -35,7 +35,7 @@ jobs:
|
|||
displayName: Run manylinux1 docker build
|
||||
- job: Intel_SDE_skx
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
# at the time of writing the available Azure Ubuntu vm image
|
||||
|
|
@ -83,6 +83,8 @@ jobs:
|
|||
- script: |
|
||||
brew update
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install
|
||||
ls -lR ../blasinst
|
||||
|
||||
- job: OSX_GCC_Nothreads
|
||||
pool:
|
||||
|
|
@ -104,6 +106,38 @@ jobs:
|
|||
brew install llvm libomp
|
||||
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
|
||||
|
||||
- job: OSX_OpenMP_Clang_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 ..
|
||||
make
|
||||
ctest
|
||||
|
||||
- job: OSX_OpenMP_Clang_gf_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 ..
|
||||
make
|
||||
ctest
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
|
|
@ -147,13 +181,34 @@ jobs:
|
|||
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
|
||||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
|
||||
|
||||
- job: OSX_IOS_ARMV8
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0
|
||||
steps:
|
||||
- script: |
|
||||
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: OSX_IOS_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1
|
||||
steps:
|
||||
- script: |
|
||||
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: ALPINE_MUSL
|
||||
pool:
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
|
||||
&& echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64
|
||||
|
|
|
|||
7
c_check
7
c_check
|
|
@ -94,6 +94,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
|
|||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
|
|
@ -143,6 +144,11 @@ if ($architecture eq "riscv64") {
|
|||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "loongarch64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
|
|
@ -226,6 +232,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/);
|
|||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
|
|
|||
2
cblas.h
2
cblas.h
|
|
@ -400,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
|
|||
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
|
|
|||
|
|
@ -113,6 +113,10 @@ if (MIPS64)
|
|||
set(NO_BINARY_MODE 1)
|
||||
endif ()
|
||||
|
||||
if (LOONGARCH64)
|
||||
set(NO_BINARY_MODE 1)
|
||||
endif ()
|
||||
|
||||
if (${ARCH} STREQUAL "alpha")
|
||||
set(NO_BINARY_MODE 1)
|
||||
set(BINARY_DEFINED 1)
|
||||
|
|
|
|||
|
|
@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
|
|||
set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64")
|
||||
endif ()
|
||||
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
|
||||
else ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME STREQUAL "AIX")
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
|
@ -124,9 +133,9 @@ if (NOT DYNAMIC_ARCH)
|
|||
if (HAVE_AVX)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
|
||||
endif ()
|
||||
if (HAVE_FMA3)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
endif ()
|
||||
# if (HAVE_FMA3)
|
||||
#set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
#endif ()
|
||||
if (HAVE_SSE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
|
||||
endif ()
|
||||
|
|
|
|||
|
|
@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
|
|||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
|
||||
endif ()
|
||||
endif ()
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
else ()
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
|
|
@ -97,7 +104,7 @@ endif ()
|
|||
|
||||
if (${F_COMPILER} STREQUAL "IBM")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM")
|
||||
# FCOMMON_OPT += -qarch=440
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur")
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -q64")
|
||||
if (INTERFACE64)
|
||||
|
|
|
|||
|
|
@ -134,6 +134,8 @@ if (BUILD_BFLOAT16)
|
|||
set(SHSWAPKERNEL ../arm/swap.c)
|
||||
set(TOBF16KERNEL ../x86_64/tobf16.c)
|
||||
set(BF16TOKERNEL ../x86_64/bf16to.c)
|
||||
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
||||
|
|
|
|||
|
|
@ -186,11 +186,11 @@ if (DEFINED TARGET)
|
|||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if (NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
endif()
|
||||
endif()
|
||||
# if (DEFINED HAVE_FMA3)
|
||||
# if (NOT NO_AVX2)
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
# endif()
|
||||
# endif()
|
||||
if (DEFINED HAVE_SSE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
|
||||
endif()
|
||||
|
|
@ -258,6 +258,13 @@ if (NEED_PIC)
|
|||
endif()
|
||||
endif ()
|
||||
|
||||
if (X86_64)
|
||||
set(SMALL_MATRIX_OPT TRUE)
|
||||
endif ()
|
||||
if (SMALL_MATRIX_OPT)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
|
|
@ -462,6 +469,9 @@ endif()
|
|||
if (BUILD_COMPLEX16)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
|
||||
endif()
|
||||
if (BUILD_BFLOAT16)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16")
|
||||
endif()
|
||||
if(NOT MSVC)
|
||||
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
|||
set(PPC 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
||||
set(LOONGARCH64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
|
|
@ -95,7 +97,7 @@ else()
|
|||
endif ()
|
||||
|
||||
if (NOT BINARY)
|
||||
if (X86_64 OR ARM64 OR PPC OR MIPS64)
|
||||
if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64)
|
||||
set(BINARY 64)
|
||||
else ()
|
||||
set(BINARY 32)
|
||||
|
|
|
|||
|
|
@ -157,31 +157,31 @@ endfunction ()
|
|||
# STRING - compiles only the given type (e.g. DOUBLE)
|
||||
function(GenerateNamedObjects sources_in)
|
||||
|
||||
if (DEFINED ARGV1)
|
||||
if (${ARGC} GREATER 1)
|
||||
set(defines_in ${ARGV1})
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "")
|
||||
if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "")
|
||||
set(name_in ${ARGV2})
|
||||
# strip off extension for kernel files that pass in the object name.
|
||||
get_filename_component(name_in ${name_in} NAME_WE)
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV3)
|
||||
if (${ARGC} GREATER 3)
|
||||
set(use_cblas ${ARGV3})
|
||||
else ()
|
||||
set(use_cblas false)
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV4)
|
||||
if (${ARGC} GREATER 4)
|
||||
set(replace_last_with ${ARGV4})
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV5)
|
||||
if (${ARGC} GREATER 5)
|
||||
set(append_with ${ARGV5})
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV6)
|
||||
if (${ARGC} GREATER 6)
|
||||
set(no_float_type ${ARGV6})
|
||||
else ()
|
||||
set(no_float_type false)
|
||||
|
|
@ -196,7 +196,7 @@ function(GenerateNamedObjects sources_in)
|
|||
set(real_only false)
|
||||
set(complex_only false)
|
||||
set(mangle_complex_sources false)
|
||||
if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "")
|
||||
if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "")
|
||||
if (${ARGV7} EQUAL 1)
|
||||
set(real_only true)
|
||||
elseif (${ARGV7} EQUAL 2)
|
||||
|
|
@ -311,7 +311,15 @@ function(GenerateNamedObjects sources_in)
|
|||
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
|
||||
file(REMOVE ${new_source_file}.tmp)
|
||||
list(APPEND SRC_LIST_OUT ${new_source_file})
|
||||
|
||||
message (STATUS ${new_source_file})
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
if ( ${new_source_file} MATCHES "dgemv_t_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
endif ()
|
||||
endforeach ()
|
||||
endforeach ()
|
||||
|
||||
|
|
@ -334,17 +342,17 @@ endfunction ()
|
|||
function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme)
|
||||
|
||||
set(alternate_name_in "")
|
||||
if (DEFINED ARGV5)
|
||||
if (${ARGC} GREATER 5)
|
||||
set(alternate_name_in ${ARGV5})
|
||||
endif ()
|
||||
|
||||
set(no_float_type false)
|
||||
if (DEFINED ARGV6)
|
||||
if (${ARGC} GREATER 6)
|
||||
set(no_float_type ${ARGV6})
|
||||
endif ()
|
||||
|
||||
set(complex_filename_scheme "")
|
||||
if (DEFINED ARGV7)
|
||||
if (${ARGC} GREATER 7)
|
||||
set(complex_filename_scheme ${ARGV7})
|
||||
endif ()
|
||||
|
||||
|
|
|
|||
4
common.h
4
common.h
|
|
@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_zarch.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_LOONGARCH64
|
||||
#include "common_loongarch64.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
#if !defined(__APPLE__) && !defined(_WIN32)
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
|
|
|
|||
45
common_c.h
45
common_c.h
|
|
@ -232,6 +232,8 @@
|
|||
|
||||
#define CGEADD_K cgeadd_k
|
||||
|
||||
#define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define CAMAX_K gotoblas -> camax_k
|
||||
|
|
@ -426,8 +428,51 @@
|
|||
|
||||
#define CGEADD_K gotoblas -> cgeadd_k
|
||||
|
||||
#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn)
|
||||
#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt)
|
||||
#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr)
|
||||
#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn)
|
||||
#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt)
|
||||
#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr)
|
||||
#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn)
|
||||
#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt)
|
||||
#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr)
|
||||
#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn)
|
||||
#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct)
|
||||
#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr)
|
||||
#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt)
|
||||
#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt)
|
||||
#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt)
|
||||
#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct)
|
||||
#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc)
|
||||
|
||||
|
||||
#define CGEMM_NN cgemm_nn
|
||||
#define CGEMM_CN cgemm_cn
|
||||
#define CGEMM_TN cgemm_tn
|
||||
|
|
|
|||
15
common_d.h
15
common_d.h
|
|
@ -157,6 +157,8 @@
|
|||
#define DIMATCOPY_K_RT dimatcopy_k_rt
|
||||
#define DGEADD_K dgeadd_k
|
||||
|
||||
#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define DAMAX_K gotoblas -> damax_k
|
||||
|
|
@ -281,8 +283,21 @@
|
|||
|
||||
#define DGEADD_K gotoblas -> dgeadd_k
|
||||
|
||||
#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
#define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn)
|
||||
#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt)
|
||||
#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn)
|
||||
#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt)
|
||||
|
||||
#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn)
|
||||
#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt)
|
||||
#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn)
|
||||
#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt)
|
||||
|
||||
|
||||
#define DGEMM_NN dgemm_nn
|
||||
#define DGEMM_CN dgemm_tn
|
||||
#define DGEMM_TN dgemm_tn
|
||||
|
|
|
|||
123
common_level3.h
123
common_level3.h
|
|
@ -515,6 +515,129 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble
|
|||
int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
|
||||
#endif
|
||||
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
|
||||
|
||||
int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
|
||||
int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1);
|
||||
|
||||
int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1);
|
||||
|
||||
int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
#endif
|
||||
|
||||
int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,199 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
/* Include guard for the LoongArch64 common definitions. */
#ifndef COMMON_LOONGARCH64
|
||||
#define COMMON_LOONGARCH64
|
||||
|
||||
/* MB, WMB and RMB all expand to the same call, the compiler builtin
   __sync_synchronize(); no weaker barrier is used for the write-only or
   read-only variants. */
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
/* INLINE maps directly onto the C `inline` keyword. */
#define INLINE inline
|
||||
|
||||
/* C-only section (skipped when the header is included from assembler sources). */
#ifndef ASSEMBLER
|
||||
|
||||
/* blas_quickdivide: returns the quotient x / y using the plain C division
   operator. The result is converted to the `int` return type.
   No check is made for y == 0; callers must pass a non-zero divisor.
   NOTE(review): blasint is declared elsewhere; if it is wider than int the
   quotient is narrowed on return — confirm this is acceptable. */
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
/* GET_IMAGE(res): inline asm that copies floating-point register $f2 into
   `res` (bound through an "=f" output constraint). fmov.d is used when DOUBLE
   is defined, fmov.s otherwise. The asm is volatile and declares a "memory"
   clobber.
   NOTE(review): presumably $f2 carries the imaginary part of a complex
   result — confirm against the callers. */
#ifdef DOUBLE
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||
#else
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
|
||||
#endif
|
||||
|
||||
/* GET_IMAGE_CANCEL is defined with an empty replacement on this target. */
#define GET_IMAGE_CANCEL
|
||||
|
||||
#else
|
||||
|
||||
/* Precision-neutral mnemonics for assembler kernels. Each generic name
   (LD, ST, MADD, ...) maps to the `.d` form of the instruction when DOUBLE is
   defined and to the `.s` form otherwise. Two entries do not follow the
   simple suffix swap: CMOVT is `fsel` in both branches, and MTC is
   movgr2fr.d for DOUBLE but movgr2fr.w for single precision. */
#ifdef DOUBLE
|
||||
#define LD fld.d
|
||||
#define ST fst.d
|
||||
#define MADD fmadd.d
|
||||
#define NMADD fnmadd.d
|
||||
#define MSUB fmsub.d
|
||||
#define NMSUB fnmsub.d
|
||||
#define ADD fadd.d
|
||||
#define SUB fsub.d
|
||||
#define MUL fmul.d
|
||||
#define MOV fmov.d
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.d
|
||||
#define FABS fabs.d
|
||||
#define CMPEQ fcmp.ceq.d
|
||||
#define CMPLE fcmp.cle.d
|
||||
#define CMPLT fcmp.clt.d
|
||||
#define NEG fneg.d
|
||||
/* Single-precision variants of the same mnemonic set. */
#else
|
||||
#define LD fld.s
|
||||
#define ST fst.s
|
||||
#define MADD fmadd.s
|
||||
#define NMADD fnmadd.s
|
||||
#define MSUB fmsub.s
|
||||
#define NMSUB fnmsub.s
|
||||
#define ADD fadd.s
|
||||
#define SUB fsub.s
|
||||
#define MUL fmul.s
|
||||
#define MOV fmov.s
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.w
|
||||
#define FABS fabs.s
|
||||
#define CMPEQ fcmp.ceq.s
|
||||
#define CMPLE fcmp.cle.s
|
||||
#define CMPLT fcmp.clt.s
|
||||
#define NEG fneg.s
|
||||
#endif /* defined(DOUBLE) */
|
||||
|
||||
/* Integer load/store mnemonics selected by ABI width and integer size:
     - __64BIT__ and USE64BITINT : LDINT, LDARG and SDARG are all 64-bit (.d)
     - __64BIT__ without USE64BITINT : LDINT is 32-bit (ld.w) while
       LDARG/SDARG stay 64-bit
     - otherwise : all three are 32-bit (.w)
   NOTE(review): presumably LDINT loads a BLAS integer and LDARG/SDARG
   load/store spilled arguments — confirm against the kernels. */
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
#define LDINT ld.d
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#elif defined(__64BIT__) && !defined(USE64BITINT)
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#else
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.w
|
||||
#define SDARG st.w
|
||||
#endif
|
||||
|
||||
|
||||
/* REALNAME is the symbol name used by PROLOGUE/EPILOGUE: it is ASMNAME by
   default and ASMFNAME when F_INTERFACE is defined. Both source names are
   supplied from outside this header. */
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif /* defined(F_INTERFACE) */
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
/* PROLOGUE: assembler function header. It switches to the .text section,
   applies `.align 5`, exports REALNAME with .globl, marks it as @function and
   emits the REALNAME label.
   NOTE(review): the label line also ends with a continuation backslash, so
   the macro body extends onto whatever line follows it — confirm that the
   following line is meant to be empty. */
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 5 ;\
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function ;\
|
||||
REALNAME: ;\
|
||||
|
||||
/* GNUSTACK: on Linux ELF targets it emits an empty .note.GNU-stack section
   (type @progbits, no flags); on every other target it expands to nothing. */
#if defined(__linux__) && defined(__ELF__)
|
||||
#define GNUSTACK .section .note.GNU-stack,"",@progbits
|
||||
#else
|
||||
#define GNUSTACK
|
||||
#endif /* defined(__linux__) && defined(__ELF__) */
|
||||
|
||||
/* EPILOGUE: assembler function trailer. It emits `.end REALNAME` followed by
   the GNUSTACK expansion. */
#define EPILOGUE \
|
||||
.end REALNAME ;\
|
||||
GNUSTACK
|
||||
|
||||
/* PROFCODE expands to nothing on this target. */
#define PROFCODE
|
||||
|
||||
/* MOVT(dst, src, cc): conditional register move built from a branch.
   `bceqz cc, 1f` jumps forward to the local label `1:` when the condition
   flag cc is zero, skipping the move; otherwise `add.d dst, src, $r0`
   writes src + $r0 into dst. The macro ends by defining the local label 1,
   so it cannot be used as an operand inside another instruction. */
#define MOVT(dst, src, cc) \
|
||||
bceqz cc, 1f; \
|
||||
add.d dst, src, $r0; \
|
||||
1:
|
||||
|
||||
#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
|
||||
|
||||
#endif /* defined(ASSEMBLER) */
|
||||
|
||||
/* SEEK_ADDRESS is defined as a bare flag (no value). */
#define SEEK_ADDRESS
|
||||
|
||||
/* Buffer size: 32 << 20 bytes (32 MiB). Note this is an int expression,
   unlike the UL-typed page sizes below. */
#define BUFFER_SIZE ( 32 << 20)
|
||||
|
||||
/* Page sizes: PAGESIZE and FIXED_PAGESIZE are both 16UL << 10 (16 KiB);
   HUGE_PAGESIZE is 2 << 20 (2 MiB). */
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
/* BASE_ADDRESS lies BUFFER_SIZE * MAX_CPU_NUMBER below START_ADDRESS; both
   START_ADDRESS and MAX_CPU_NUMBER are defined outside this header. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
/* Fallback: when MAP_ANONYMOUS is not already defined, alias it to MAP_ANON. */
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
123
common_macro.h
123
common_macro.h
|
|
@ -644,6 +644,17 @@
|
|||
|
||||
/* D-prefixed branch: the precision-generic names resolve to the DGEADD_K and
   DGEMM_SMALL_* macros. */
#define GEADD_K DGEADD_K
|
||||
|
||||
/* Generic name for the small-matrix permit hook. */
#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
/* Small-matrix kernels, one per NN/NT/TN/TT suffix.
   NOTE(review): presumably N/T encode the transpose state of A and B — confirm. */
#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT
|
||||
/* B0 variants of the same four kernels. */
#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT
|
||||
|
||||
#elif defined(BFLOAT16)
|
||||
|
||||
#define D_TO_BF16_K SBDTOBF16_K
|
||||
|
|
@ -931,6 +942,18 @@
|
|||
|
||||
/* SB-prefixed branch: GEADD_K still maps to the single-precision SGEADD_K,
   while the small-matrix GEMM names resolve to the SBGEMM_SMALL_* macros. */
#define GEADD_K SGEADD_K
|
||||
|
||||
/* Generic name for the small-matrix permit hook. */
#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
/* Small-matrix kernels, one per NN/NT/TN/TT suffix. */
#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT
|
||||
|
||||
/* B0 variants of the same four kernels. */
#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
|
@ -1236,6 +1259,19 @@
|
|||
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||
|
||||
/* S-prefixed branch: the precision-generic names resolve to SGEADD_K and the
   SGEMM_SMALL_* macros. */
#define GEADD_K SGEADD_K
|
||||
|
||||
/* Generic name for the small-matrix permit hook. */
#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
/* Small-matrix kernels, one per NN/NT/TN/TT suffix. */
#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT
|
||||
|
||||
/* B0 variants of the same four kernels. */
#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT
|
||||
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
|
|
@ -2063,6 +2099,48 @@
|
|||
|
||||
/* Z-prefixed branch: the precision-generic names resolve to ZGEADD_K and the
   ZGEMM_SMALL_* macros. Unlike the real-valued branches, there are sixteen
   kernel suffixes (every pairing of N, T, R, C), each with a B0 twin.
   NOTE(review): presumably R and C denote conjugated / conjugate-transposed
   operands — confirm against the kernel sources. */
#define GEADD_K ZGEADD_K
|
||||
|
||||
/* Generic name for the small-matrix permit hook. */
#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
/* Kernels whose first suffix letter is N. */
#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR
|
||||
#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC
|
||||
|
||||
/* Kernels whose first suffix letter is T. */
#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT
|
||||
#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR
|
||||
#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC
|
||||
|
||||
/* Kernels whose first suffix letter is R. */
#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN
|
||||
#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT
|
||||
#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR
|
||||
#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC
|
||||
|
||||
/* Kernels whose first suffix letter is C. */
#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN
|
||||
#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT
|
||||
#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR
|
||||
#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC
|
||||
|
||||
/* B0 variants, in the same N/T/R/C grouping as above. */
#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR
|
||||
#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT
|
||||
#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR
|
||||
#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN
|
||||
#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT
|
||||
#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR
|
||||
#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN
|
||||
#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT
|
||||
#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR
|
||||
#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC
|
||||
|
||||
#else
|
||||
|
||||
#define AMAX_K CAMAX_K
|
||||
|
|
@ -2486,11 +2564,54 @@
|
|||
|
||||
/* C-prefixed branch: the precision-generic names resolve to CGEADD_K and the
   CGEMM_SMALL_* macros. As in the Z-prefixed branch there are sixteen kernel
   suffixes (every pairing of N, T, R, C), each with a B0 twin.
   NOTE(review): presumably R and C denote conjugated / conjugate-transposed
   operands — confirm against the kernel sources. */
#define GEADD_K CGEADD_K
|
||||
|
||||
/* Generic name for the small-matrix permit hook. */
#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
/* Kernels whose first suffix letter is N. */
#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR
|
||||
#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC
|
||||
|
||||
/* Kernels whose first suffix letter is T. */
#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT
|
||||
#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR
|
||||
#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC
|
||||
|
||||
/* Kernels whose first suffix letter is R. */
#define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN
|
||||
#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT
|
||||
#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR
|
||||
#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC
|
||||
|
||||
/* Kernels whose first suffix letter is C. */
#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN
|
||||
#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT
|
||||
#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR
|
||||
#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC
|
||||
|
||||
/* B0 variants, in the same N/T/R/C grouping as above. */
#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR
|
||||
#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT
|
||||
#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR
|
||||
#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN
|
||||
#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT
|
||||
#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR
|
||||
#define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN
|
||||
#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT
|
||||
#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR
|
||||
#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|
||||
|| defined(ARCH_LOONGARCH64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sbgemm_p;
|
||||
|
|
|
|||
132
common_param.h
132
common_param.h
|
|
@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
|
||||
/* bfloat16 small-matrix GEMM hooks; only present when SMALL_MATRIX_OPT is defined. */
#ifdef SMALL_MATRIX_OPT
|
||||
/* Permit hook: receives the two transpose flags, the m/n/k sizes and
   alpha/beta, and returns an int.
   NOTE(review): presumably a non-zero result selects the small-matrix
   path — confirm against the GEMM interface code. */
int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
/* Kernels taking both alpha and beta: A and B are bfloat16 with leading
   dimensions lda/ldb, C is float with leading dimension ldc. */
int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
/* b0 kernels: same signature without the beta parameter.
   NOTE(review): presumably used when beta == 0 — confirm. */
int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
|
|
@ -207,6 +220,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
#ifdef BUILD_SINGLE
|
||||
/* Single-precision small-matrix GEMM hooks; only present when SMALL_MATRIX_OPT is defined. */
#ifdef SMALL_MATRIX_OPT
|
||||
/* Permit hook: receives the two transpose flags, the m/n/k sizes and
   alpha/beta, and returns an int.
   NOTE(review): presumably a non-zero result selects the small-matrix
   path — confirm against the GEMM interface code. */
int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
/* Kernels taking both alpha and beta; A, B and C are float arrays with
   leading dimensions lda, ldb and ldc. */
int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
/* b0 kernels: same signature without the beta parameter.
   NOTE(review): presumably used when beta == 0 — confirm. */
int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
#endif
|
||||
|
||||
int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
|
@ -314,6 +341,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
#ifdef BUILD_DOUBLE
|
||||
/* Double-precision small-matrix GEMM hooks; only present when SMALL_MATRIX_OPT is defined. */
#ifdef SMALL_MATRIX_OPT
|
||||
/* Permit hook: receives the two transpose flags, the m/n/k sizes and
   alpha/beta, and returns an int.
   NOTE(review): presumably a non-zero result selects the small-matrix
   path — confirm against the GEMM interface code. */
int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
|
||||
|
||||
/* Kernels taking both alpha and beta; A, B and C are double arrays with
   leading dimensions lda, ldb and ldc. */
int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
|
||||
/* b0 kernels: same signature without the beta parameter.
   NOTE(review): presumably used when beta == 0 — confirm. */
int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
#endif
|
||||
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
|
|
@ -513,6 +553,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
|
||||
/* cgemm small-matrix hooks (float data); only present when SMALL_MATRIX_OPT
   is defined. alpha and beta are each passed as two floats (alpha0/alpha1,
   beta0/beta1).
   NOTE(review): presumably these are the real and imaginary parts, and the
   n/t/r/c suffix letters encode the operation applied to A and B — confirm. */
#ifdef SMALL_MATRIX_OPT
|
||||
/* Permit hook: receives the transpose flags, m/n/k and both scalar pairs; returns an int. */
int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1);
|
||||
|
||||
/* Kernels taking alpha and beta: sixteen suffix pairings, grouped by first letter. */
int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
/* b0 kernels: same sixteen pairings without the beta0/beta1 parameters.
   NOTE(review): presumably used when beta == 0 — confirm. */
int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
#endif
|
||||
|
||||
int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
|
@ -679,6 +763,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
|
|||
int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
|
||||
/* zgemm small-matrix hooks (double data); only present when SMALL_MATRIX_OPT
   is defined. alpha and beta are each passed as two doubles (alpha0/alpha1,
   beta0/beta1).
   NOTE(review): presumably these are the real and imaginary parts, and the
   n/t/r/c suffix letters encode the operation applied to A and B — confirm. */
#ifdef SMALL_MATRIX_OPT
|
||||
/* Permit hook: receives the transpose flags, m/n/k and both scalar pairs; returns an int. */
int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1);
|
||||
|
||||
/* Kernels taking alpha and beta: sixteen suffix pairings, grouped by first letter. */
int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
/* b0 kernels: same sixteen pairings without the beta0/beta1 parameters.
   NOTE(review): presumably used when beta == 0 — confirm. */
int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
#endif
|
||||
|
||||
int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
|
|
@ -1069,6 +1197,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
extern gotoblas_t *gotoblas;
|
||||
|
||||
/* FUNC_OFFSET(func), variant used alongside the `gotoblas` table pointer:
   yields, as a size_t, the address of member `func` taken through a NULL
   gotoblas_t pointer, i.e. the member's byte offset inside gotoblas_t.
   NOTE(review): this is the hand-written form of offsetof(gotoblas_t, func);
   the standard macro from <stddef.h> would avoid the NULL-pointer member
   access. */
#define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func)
|
||||
|
||||
#define DTB_ENTRIES gotoblas -> dtb_entries
|
||||
#define GEMM_OFFSET_A gotoblas -> offsetA
|
||||
#define GEMM_OFFSET_B gotoblas -> offsetB
|
||||
|
|
@ -1174,6 +1304,8 @@ extern gotoblas_t *gotoblas;
|
|||
|
||||
#else
|
||||
|
||||
/* FUNC_OFFSET(func), variant in the branch that uses the *_DEFAULT_*
   constants instead of the gotoblas table: simply casts `func` itself to
   size_t, so the "offset" here is the value of the named symbol rather than a
   struct-member offset. */
#define FUNC_OFFSET(func) (size_t)(func)
|
||||
|
||||
#define DTB_ENTRIES DTB_DEFAULT_ENTRIES
|
||||
|
||||
#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A
|
||||
|
|
|
|||
15
common_s.h
15
common_s.h
|
|
@ -164,6 +164,8 @@
|
|||
|
||||
/* In this branch the S-prefixed names resolve directly to plain symbols
   (sgeadd_k, sgemm_small_matrix_permit), not to gotoblas members. */
#define SGEADD_K sgeadd_k
|
||||
|
||||
#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define SAMAX_K gotoblas -> samax_k
|
||||
|
|
@ -299,8 +301,21 @@
|
|||
|
||||
/* In this branch the S-prefixed names resolve to function-pointer members
   reached through the `gotoblas` pointer. */
#define SGEADD_K gotoblas -> sgeadd_k
|
||||
|
||||
#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
/* Small-matrix kernel selectors for sgemm. Each name expands through
   FUNC_OFFSET() applied to the matching lowercase kernel name, rather than
   directly to a symbol or a gotoblas member; FUNC_OFFSET itself is defined
   elsewhere. */
#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn)
|
||||
#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt)
|
||||
#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn)
|
||||
#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt)
|
||||
|
||||
/* B0 variants of the same four kernels. */
#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn)
|
||||
#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt)
|
||||
#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn)
|
||||
#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt)
|
||||
|
||||
|
||||
#define SGEMM_NN sgemm_nn
|
||||
#define SGEMM_CN sgemm_tn
|
||||
#define SGEMM_TN sgemm_tn
|
||||
|
|
|
|||
12
common_sb.h
12
common_sb.h
|
|
@ -24,6 +24,7 @@
|
|||
#define SBGEMM_BETA sbgemm_beta
|
||||
#define SBGEMM_KERNEL sbgemm_kernel
|
||||
|
||||
#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit
|
||||
#else
|
||||
|
||||
#define SBDOT_K gotoblas -> sbdot_k
|
||||
|
|
@ -41,8 +42,19 @@
|
|||
#define SBGEMM_BETA gotoblas -> sbgemm_beta
|
||||
#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel
|
||||
|
||||
#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit
|
||||
#endif
|
||||
|
||||
#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn)
|
||||
#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt)
|
||||
#define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn)
|
||||
#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt)
|
||||
|
||||
#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn)
|
||||
#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt)
|
||||
#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn)
|
||||
#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt)
|
||||
|
||||
#define SBGEMM_NN sbgemm_nn
|
||||
#define SBGEMM_CN sbgemm_tn
|
||||
#define SBGEMM_TN sbgemm_tn
|
||||
|
|
|
|||
45
common_z.h
45
common_z.h
|
|
@ -232,6 +232,8 @@
|
|||
|
||||
#define ZGEADD_K zgeadd_k
|
||||
|
||||
#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define ZAMAX_K gotoblas -> zamax_k
|
||||
|
|
@ -426,8 +428,51 @@
|
|||
|
||||
#define ZGEADD_K gotoblas -> zgeadd_k
|
||||
|
||||
#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn)
|
||||
#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt)
|
||||
#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr)
|
||||
#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn)
|
||||
#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt)
|
||||
#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr)
|
||||
#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn)
|
||||
#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt)
|
||||
#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr)
|
||||
#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn)
|
||||
#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct)
|
||||
#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr)
|
||||
#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc)
|
||||
|
||||
|
||||
#define ZGEMM_NN zgemm_nn
|
||||
#define ZGEMM_CN zgemm_cn
|
||||
#define ZGEMM_TN zgemm_tn
|
||||
|
|
|
|||
|
|
@ -0,0 +1,110 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_LOONGSON3R5 1
|
||||
|
||||
#define LOONGARCH_CFG2 0x02
|
||||
/* Bit mask (bit 7) tested by detect() against the value returned by cpucfg for
   LOONGARCH_CFG2. Parenthesized so the shift stays a single operand wherever
   the macro is expanded. */
#define LOONGARCH_LASX (1<<7)
|
||||
|
||||
/* Core names returned by get_corename(); the table is indexed by the value
   that detect() returns. */
static char *cpuname[] = {
  "UNKNOWN",      /* index 0 */
  "LOONGSON3R5"   /* index 1 */
};
|
||||
|
||||
int detect(void) {
|
||||
uint32_t reg = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (reg & LOONGARCH_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
||||
char *get_corename(void) {
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
void get_architecture(void) {
  /* Write the architecture name to stdout, without a trailing newline. */
  fputs("LOONGARCH64", stdout);
}
|
||||
|
||||
void get_subarchitecture(void) {
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("LOONGSON3R5");
|
||||
} else {
|
||||
printf("UNKNOWN");
|
||||
}
|
||||
}
|
||||
|
||||
void get_subdirname(void) {
  /* Write the sub-directory name to stdout, without a trailing newline. */
  fputs("loongarch64", stdout);
}
|
||||
|
||||
void get_cpuconfig(void) {
  /* Write the CPU configuration as "#define" lines on stdout.
     The same LOONGSON3R5 parameter set is emitted whether or not detect()
     reports CPU_LOONGSON3R5, so the lines are stored once and printed in
     order. */
  static const char *const config_lines[] = {
    "#define LOONGSON3R5\n",
    "#define L1_DATA_SIZE 65536\n",
    "#define L1_DATA_LINESIZE 64\n",
    "#define L2_SIZE 1048576\n",
    "#define L2_LINESIZE 64\n",
    "#define DTB_DEFAULT_ENTRIES 64\n",
    "#define DTB_SIZE 4096\n",
    "#define L2_ASSOCIATIVE 16\n"
  };
  unsigned int idx;

  /* The CPU probe still runs as before; its result does not change the
     output because both outcomes select the same lines. */
  (void)detect();

  for (idx = 0; idx < sizeof(config_lines) / sizeof(config_lines[0]); idx++)
    fputs(config_lines[idx], stdout);
}
|
||||
|
||||
void get_libname(void){
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("loongson3r5\n");
|
||||
} else {
|
||||
printf("loongarch64\n");
|
||||
}
|
||||
}
|
||||
8
ctest.c
8
ctest.c
|
|
@ -84,7 +84,7 @@ OS_AIX
|
|||
OS_OSF
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT)
|
||||
#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT)
|
||||
OS_WINNT
|
||||
#endif
|
||||
|
||||
|
|
@ -141,7 +141,7 @@ ARCH_SPARC
|
|||
ARCH_IA64
|
||||
#endif
|
||||
|
||||
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__)
|
||||
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__)
|
||||
BINARY_64
|
||||
#endif
|
||||
|
||||
|
|
@ -157,6 +157,10 @@ ARCH_ARM64
|
|||
ARCH_RISCV64
|
||||
#endif
|
||||
|
||||
#ifdef __loongarch64
|
||||
ARCH_LOONGARCH64
|
||||
#endif
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
|
||||
HAVE_C11
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR})
|
|||
enable_language(Fortran)
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
|
||||
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
|
||||
|
|
|
|||
|
|
@ -6,6 +6,9 @@ TOPDIR = ..
|
|||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
ifeq ($(F_COMPILER),GFORTRAN)
|
||||
override FFLAGS += -fno-tree-vectorize
|
||||
endif
|
||||
override TARGET_ARCH=
|
||||
override TARGET_MACH=
|
||||
|
||||
|
|
|
|||
|
|
@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type})
|
||||
endif ()
|
||||
|
||||
# special defines for complex
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
|
||||
foreach (u_source ${U_SOURCES})
|
||||
|
|
@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
endif ()
|
||||
endforeach ()
|
||||
|
||||
if (BUILD_BFLOAT16)
|
||||
if (USE_THREAD)
|
||||
GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)
|
||||
if (USE_THREAD)
|
||||
GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE")
|
||||
|
|
|
|||
|
|
@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES})
|
|||
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0)
|
||||
endif ()
|
||||
if (BUILD_BFLOAT16)
|
||||
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
|
||||
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
|
|
|
|||
|
|
@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8;
|
|||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
|
||||
// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
//#define HAVE_P10_SUPPORT 1
|
||||
//#endif
|
||||
#ifdef HAVE_P10_SUPPORT
|
||||
extern gotoblas_t gotoblas_POWER10;
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef likely
|
||||
#ifdef __GNUC__
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
#else
|
||||
#define likely(x) (x)
|
||||
#define unlikely(x) (x)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(USE_TLS) && defined(SMP)
|
||||
#define COMPILE_TLS
|
||||
|
||||
|
|
@ -428,7 +438,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
|
@ -436,7 +446,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
|
|
@ -460,7 +470,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
|
@ -1291,7 +1301,12 @@ UNLOCK_COMMAND(&alloc_lock);
|
|||
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
|
||||
|
||||
error:
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
|
@ -1979,7 +1994,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
|
@ -1987,7 +2002,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
|
|
@ -2011,7 +2026,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
|
@ -2055,6 +2070,7 @@ struct release_t {
|
|||
int hugetlb_allocated = 0;
|
||||
|
||||
static struct release_t release_info[NUM_BUFFERS];
|
||||
static struct release_t *new_release_info;
|
||||
static int release_pos = 0;
|
||||
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
|
|
@ -2105,8 +2121,13 @@ static void *alloc_mmap(void *address){
|
|||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
|
||||
}
|
||||
release_pos ++;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
|
@ -2269,8 +2290,13 @@ static void *alloc_mmap(void *address){
|
|||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
|
||||
}
|
||||
release_pos ++;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
|
@ -2302,8 +2328,13 @@ static void *alloc_malloc(void *address){
|
|||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_malloc_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2336,8 +2367,13 @@ static void *alloc_qalloc(void *address){
|
|||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_qalloc_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2365,8 +2401,13 @@ static void *alloc_windows(void *address){
|
|||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_windows_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2409,9 +2450,15 @@ static void *alloc_devicedirver(void *address){
|
|||
fd, 0);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = fd;
|
||||
release_info[release_pos].func = alloc_devicedirver_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].attr = fd;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2445,9 +2492,15 @@ static void *alloc_shm(void *address){
|
|||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = shmid;
|
||||
release_info[release_pos].func = alloc_shm_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].attr = shmid;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2551,8 +2604,13 @@ static void *alloc_hugetlb(void *address){
|
|||
#endif
|
||||
|
||||
if (map_address != (void *)-1){
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_hugetlb_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2599,9 +2657,15 @@ static void *alloc_hugetlbfile(void *address){
|
|||
fd, 0);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = fd;
|
||||
release_info[release_pos].func = alloc_hugetlbfile_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].attr = fd;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2631,8 +2695,25 @@ static volatile struct {
|
|||
|
||||
} memory[NUM_BUFFERS];
|
||||
|
||||
static int memory_initialized = 0;
|
||||
struct newmemstruct
|
||||
{
|
||||
BLASULONG lock;
|
||||
void *addr;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int pos;
|
||||
#endif
|
||||
int used;
|
||||
#ifndef __64BIT__
|
||||
char dummy[48];
|
||||
#else
|
||||
char dummy[40];
|
||||
#endif
|
||||
|
||||
};
|
||||
static volatile struct newmemstruct *newmemory;
|
||||
|
||||
static int memory_initialized = 0;
|
||||
static int memory_overflowed = 0;
|
||||
/* Memory allocation routine */
|
||||
/* procpos ... indicates where it comes from */
|
||||
/* 0 : Level 3 functions */
|
||||
|
|
@ -2641,6 +2722,8 @@ static int memory_initialized = 0;
|
|||
|
||||
void *blas_memory_alloc(int procpos){
|
||||
|
||||
int i;
|
||||
|
||||
int position;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int mypos = 0;
|
||||
|
|
@ -2774,6 +2857,29 @@ void *blas_memory_alloc(int procpos){
|
|||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
if (memory_overflowed) {
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
do {
|
||||
RMB;
|
||||
#if defined(USE_OPENMP)
|
||||
if (!newmemory[position-NUM_BUFFERS].used) {
|
||||
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
#endif
|
||||
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
}
|
||||
#endif
|
||||
position ++;
|
||||
|
||||
} while (position < 512+NUM_BUFFERS);
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
goto error;
|
||||
|
||||
allocation :
|
||||
|
|
@ -2878,8 +2984,97 @@ void *blas_memory_alloc(int procpos){
|
|||
return (void *)memory[position].addr;
|
||||
|
||||
error:
|
||||
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
if (memory_overflowed) goto terminate;
|
||||
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
|
||||
memory_overflowed=1;
|
||||
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
|
||||
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
|
||||
for (i = 0; i < 512; i++) {
|
||||
newmemory[i].addr = (void *)0;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
newmemory[i].pos = -1;
|
||||
#endif
|
||||
newmemory[i].used = 0;
|
||||
newmemory[i].lock = 0;
|
||||
}
|
||||
newmemory[position-NUM_BUFFERS].used = 1;
|
||||
|
||||
allocation2:
|
||||
newmemory[position-NUM_BUFFERS].used = 1;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
#endif
|
||||
do {
|
||||
#ifdef DEBUG
|
||||
printf("Allocation Start : %lx\n", base_address);
|
||||
#endif
|
||||
|
||||
map_address = (void *)-1;
|
||||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
#ifdef ALLOC_DEVICEDRIVER
|
||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ALLOC_HUGETLBFILE
|
||||
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
|
||||
#ifndef OS_WINDOWS
|
||||
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
|
||||
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
|
||||
#endif
|
||||
|
||||
func ++;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Success -> %08lx\n", map_address);
|
||||
#endif
|
||||
if (((BLASLONG) map_address) == -1) base_address = 0UL;
|
||||
|
||||
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
|
||||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
newmemory[position-NUM_BUFFERS].addr = map_address;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position);
|
||||
#endif
|
||||
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
|
||||
if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos;
|
||||
|
||||
#endif
|
||||
return (void *)newmemory[position-NUM_BUFFERS].addr;
|
||||
|
||||
terminate:
|
||||
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -2898,13 +3093,28 @@ void blas_memory_free(void *free_area){
|
|||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
if (position >= NUM_BUFFERS) goto error;
|
||||
if (position >= NUM_BUFFERS && !memory_overflowed) goto error;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (memory[position].addr != free_area) goto error;
|
||||
printf(" Position : %d\n", position);
|
||||
#endif
|
||||
if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) {
|
||||
while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area))
|
||||
position++;
|
||||
// arm: ensure all writes are finished before other thread takes this memory
|
||||
WMB;
|
||||
|
||||
newmemory[position].used = 0;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap from overflow area succeeded.\n\n");
|
||||
#endif
|
||||
return;
|
||||
} else {
|
||||
// arm: ensure all writes are finished before other thread takes this memory
|
||||
WMB;
|
||||
|
||||
|
|
@ -2918,7 +3128,7 @@ void blas_memory_free(void *free_area){
|
|||
#endif
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
error:
|
||||
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
|
||||
|
||||
|
|
@ -2953,7 +3163,10 @@ void blas_shutdown(void){
|
|||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
for (pos = 0; pos < release_pos; pos ++) {
|
||||
if (likely(pos < NUM_BUFFERS))
|
||||
release_info[pos].func(&release_info[pos]);
|
||||
else
|
||||
new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]);
|
||||
}
|
||||
|
||||
#ifdef SEEK_ADDRESS
|
||||
|
|
@ -2970,6 +3183,15 @@ void blas_shutdown(void){
|
|||
#endif
|
||||
memory[pos].lock = 0;
|
||||
}
|
||||
if (memory_overflowed)
|
||||
for (pos = 0; pos < 512; pos ++){
|
||||
newmemory[pos].addr = (void *)0;
|
||||
newmemory[pos].used = 0;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
newmemory[pos].pos = -1;
|
||||
#endif
|
||||
newmemory[pos].lock = 0;
|
||||
}
|
||||
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
|
|
|
|||
|
|
@ -524,6 +524,9 @@ void blas_set_parameter(void){
|
|||
xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
|
||||
#endif
|
||||
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
||||
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
||||
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
||||
|
|
@ -629,7 +632,9 @@ void blas_set_parameter(void){
|
|||
xgemm_p = 16 * (size + 1);
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
|
||||
#endif
|
||||
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
||||
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
||||
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
||||
|
|
|
|||
111
getarch.c
111
getarch.c
|
|
@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3R3 */
|
||||
/* #define FORCE_LOONGSON3R4 */
|
||||
/* #define FORCE_LOONGSON3R5 */
|
||||
/* #define FORCE_I6400 */
|
||||
/* #define FORCE_P6600 */
|
||||
/* #define FORCE_P5600 */
|
||||
|
|
@ -312,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#ifdef NO_AVX
|
||||
#define SUBARCHITECTURE "NEHALEM"
|
||||
#define ARCHCONFIG "-DNEHALEM " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
|
||||
#define LIBNAME "nehalem"
|
||||
#define CORENAME "NEHALEM"
|
||||
#else
|
||||
#define SUBARCHITECTURE "SANDYBRIDGE"
|
||||
#define ARCHCONFIG "-DSANDYBRIDGE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
|
|
@ -321,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define LIBNAME "sandybridge"
|
||||
#define CORENAME "SANDYBRIDGE"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_HASWELL
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#ifdef NO_AVX2
|
||||
#ifdef NO_AVX
|
||||
#define SUBARCHITECTURE "NEHALEM"
|
||||
#define ARCHCONFIG "-DNEHALEM " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
|
||||
#define LIBNAME "nehalem"
|
||||
#define CORENAME "NEHALEM"
|
||||
#else
|
||||
#define SUBARCHITECTURE "SANDYBRIDGE"
|
||||
#define ARCHCONFIG "-DSANDYBRIDGE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
|
|
@ -335,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
|
||||
#define LIBNAME "sandybridge"
|
||||
#define CORENAME "SANDYBRIDGE"
|
||||
#endif
|
||||
#else
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
|
|
@ -349,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef FORCE_SKYLAKEX
|
||||
#ifdef NO_AVX512
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#ifdef NO_AVX512
|
||||
#ifdef NO_AVX2
|
||||
#ifdef NO_AVX
|
||||
#define SUBARCHITECTURE "NEHALEM"
|
||||
#define ARCHCONFIG "-DNEHALEM " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
|
||||
#define LIBNAME "nehalem"
|
||||
#define CORENAME "NEHALEM"
|
||||
#else
|
||||
#define SUBARCHITECTURE "SANDYBRIDGE"
|
||||
#define ARCHCONFIG "-DSANDYBRIDGE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
|
||||
#define LIBNAME "sandybridge"
|
||||
#define CORENAME "SANDYBRIDGE"
|
||||
#endif
|
||||
#else
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
|
|
@ -362,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#endif
|
||||
#else
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "SKYLAKEX"
|
||||
#define ARCHCONFIG "-DSKYLAKEX " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
|
|
@ -379,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#ifdef FORCE_COOPERLAKE
|
||||
#ifdef NO_AVX512
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#ifdef NO_AVX512
|
||||
#ifdef NO_AVX2
|
||||
#ifdef NO_AVX
|
||||
#define SUBARCHITECTURE "NEHALEM"
|
||||
#define ARCHCONFIG "-DNEHALEM " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
|
||||
#define LIBNAME "nehalem"
|
||||
#define CORENAME "NEHALEM"
|
||||
#else
|
||||
#define SUBARCHITECTURE "SANDYBRIDGE"
|
||||
#define ARCHCONFIG "-DSANDYBRIDGE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
|
||||
#define LIBNAME "sandybridge"
|
||||
#define CORENAME "SANDYBRIDGE"
|
||||
#endif
|
||||
#else
|
||||
#define SUBARCHITECTURE "HASWELL"
|
||||
#define ARCHCONFIG "-DHASWELL " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
|
|
@ -392,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
|
||||
#define LIBNAME "haswell"
|
||||
#define CORENAME "HASWELL"
|
||||
#endif
|
||||
#else
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "COOPERLAKE"
|
||||
#define ARCHCONFIG "-DCOOPERLAKE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
|
|
@ -563,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#ifdef NO_AVX2
|
||||
#ifdef NO_AVX
|
||||
#define SUBARCHITECTURE "NEHALEM"
|
||||
#define ARCHCONFIG "-DNEHALEM " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
|
||||
#define LIBNAME "nehalem"
|
||||
#define CORENAME "NEHALEM"
|
||||
#else
|
||||
#define SUBARCHITECTURE "SANDYBRIDGE"
|
||||
#define ARCHCONFIG "-DSANDYBRIDGE " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
|
|
@ -571,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
|
||||
#define LIBNAME "sandybridge"
|
||||
#define CORENAME "SANDYBRIDGE"
|
||||
#endif
|
||||
#else
|
||||
#define SUBARCHITECTURE "ZEN"
|
||||
#define ARCHCONFIG "-DZEN " \
|
||||
|
|
@ -842,6 +914,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_LOONGSON3R5
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "LOONGARCH"
|
||||
#define SUBARCHITECTURE "LOONGSON3R5"
|
||||
#define SUBDIRNAME "loongarch64"
|
||||
#define ARCHCONFIG "-DLOONGSON3R5 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
|
||||
#define LIBNAME "loongson3r5"
|
||||
#define CORENAME "LOONGSON3R5"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_I6400
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
|
|
@ -1388,6 +1474,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __loongarch64
|
||||
#include "cpuid_loongarch64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __riscv
|
||||
#include "cpuid_riscv64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
|
|
@ -1463,7 +1554,7 @@ int main(int argc, char *argv[]){
|
|||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
@ -1611,7 +1702,7 @@ printf("ELF_VERSION=2\n");
|
|||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS})
|
|||
GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX})
|
||||
GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})
|
||||
|
||||
GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true)
|
||||
#sdsdot, dsdot
|
||||
if (BUILD_SINGLE OR BUILD_DOUBLE)
|
||||
GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE")
|
||||
|
|
@ -104,6 +105,15 @@ endif ()
|
|||
GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG})
|
||||
GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG})
|
||||
|
||||
if (BUILD_BFLOAT16)
|
||||
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
endif ()
|
||||
|
||||
# complex-specific sources
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
|
|
|
|||
|
|
@ -105,6 +105,55 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
|
|||
#endif
|
||||
};
|
||||
|
||||
/* USE_SMALL_MATRIX_OPT is always defined after this point: 1 when
   SMALL_MATRIX_OPT is defined and neither GEMM3M nor XDOUBLE is, 0 otherwise,
   so later code can test it with a plain #if. */
#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE)
|
||||
#define USE_SMALL_MATRIX_OPT 1
|
||||
#else
|
||||
#define USE_SMALL_MATRIX_OPT 0
|
||||
#endif
|
||||
|
||||
#if USE_SMALL_MATRIX_OPT
|
||||
#ifndef DYNAMIC_ARCH
|
||||
#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx]))
|
||||
#else
|
||||
#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx]))))
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef COMPLEX
|
||||
static size_t gemm_small_kernel[] = {
|
||||
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0,
|
||||
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0,
|
||||
};
|
||||
|
||||
|
||||
static size_t gemm_small_kernel_b0[] = {
|
||||
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0,
|
||||
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0,
|
||||
};
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx))
|
||||
#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx))
|
||||
#else
|
||||
|
||||
static size_t zgemm_small_kernel[] = {
|
||||
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN,
|
||||
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT,
|
||||
GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR,
|
||||
GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC,
|
||||
};
|
||||
|
||||
static size_t zgemm_small_kernel_b0[] = {
|
||||
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN,
|
||||
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT,
|
||||
GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR,
|
||||
GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC,
|
||||
};
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx))
|
||||
#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(char *TRANSA, char *TRANSB,
|
||||
|
|
@ -224,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
blasint m, blasint n, blasint k,
|
||||
#ifndef COMPLEX
|
||||
FLOAT alpha,
|
||||
FLOAT *a, blasint lda,
|
||||
FLOAT *b, blasint ldb,
|
||||
IFLOAT *a, blasint lda,
|
||||
IFLOAT *b, blasint ldb,
|
||||
FLOAT beta,
|
||||
FLOAT *c, blasint ldc) {
|
||||
#else
|
||||
|
|
@ -277,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
|
||||
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
|
||||
#ifdef DYNAMIC_ARCH
|
||||
if (support_avx512() )
|
||||
#endif
|
||||
|
|
@ -417,6 +466,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
#if USE_SMALL_MATRIX_OPT
|
||||
#if !defined(COMPLEX)
|
||||
if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){
|
||||
if(*(FLOAT *)(args.beta) == 0.0){
|
||||
(GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc);
|
||||
}else{
|
||||
(GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#else
|
||||
if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){
|
||||
if(beta[0] == 0.0 && beta[1] == 0.0){
|
||||
(ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc);
|
||||
}else{
|
||||
(ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
buffer = (XFLOAT *)blas_memory_alloc(0);
|
||||
|
||||
sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
|
||||
|
|
|
|||
|
|
@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
|
|||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) {
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
FLOAT * ALPHA = α
|
||||
FLOAT alpha_r = ALPHA[0];
|
||||
|
|
@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
|||
|
|
@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE")
|
||||
|
||||
# sbdot
|
||||
if (BUILD_BFLOAT16)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16")
|
||||
endif()
|
||||
|
||||
if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE")
|
||||
|
|
@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type})
|
||||
|
|
@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
if (BUILD_BFLOAT16)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
|
||||
|
|
@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
|
||||
endif()
|
||||
|
||||
foreach (float_type SINGLE DOUBLE BFLOAT16)
|
||||
foreach (float_type SINGLE DOUBLE)
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
if (NOT ${BUILD_BFLOAT16})
|
||||
continue ()
|
||||
else ()
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
|
||||
endforeach()
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
|
|
@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE")
|
||||
endif ()
|
||||
|
||||
if (BUILD_BFLOAT16)
|
||||
if (SBGEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16")
|
||||
endif ()
|
||||
if (SBGEMMITCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16")
|
||||
endif ()
|
||||
if (SBGEMMONCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16")
|
||||
endif ()
|
||||
if (SBGEMMOTCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (${float_char}GEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
|
||||
endif ()
|
||||
|
|
@ -458,7 +474,155 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type})
|
||||
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_NN)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_NT)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_TN)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_TT)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NN)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_NT)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TN)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
|
||||
endif ()
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}GEMM_SMALL_K_B0_TT)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
set(${float_char}GEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt.c)
|
||||
else ()
|
||||
set(${float_char}GEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (SMALL_MATRIX_OPT)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type})
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type})
|
||||
|
||||
else ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type})
|
||||
# The TT object is built from the TT kernel source (as the complex branch does), not the NT one.
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
|
||||
# The beta==0 TT object is built from the B0_TT kernel source, not the B0_NT one.
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
|
||||
endif ()
|
||||
if (BUILD_BFLOAT16)
|
||||
if (NOT DEFINED SBGEMM_SMALL_M_PERMIT)
|
||||
set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_NN)
|
||||
set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_NT)
|
||||
set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_TN)
|
||||
set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_TT)
|
||||
set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_B0_NN)
|
||||
set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_B0_NT)
|
||||
set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_B0_TN)
|
||||
set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
|
||||
endif ()
|
||||
# Default the bfloat16 beta==0 TT kernel to the generic source. The variable name
# is given without a leading '$' so that SBGEMM_SMALL_K_B0_TT itself is set.
if (NOT DEFINED SBGEMM_SMALL_K_B0_TT)
|
||||
set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
endif ()
|
||||
# Generate the bfloat16 small-matrix objects; each transpose variant is built
# from its own kernel source (the TT objects use the *_TT sources).
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED ${float_char}OMATCOPY_CN)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
|
||||
|
|
@ -592,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
#geadd
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
|
||||
endforeach ()
|
||||
|
||||
if (BUILD_DOUBLE AND NOT BUILD_SINGLE)
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE")
|
||||
|
|
|
|||
|
|
@ -1,3 +1,10 @@
|
|||
# Optional FMA code-generation flag, used by compile commands that add
# $(FMAFLAG) (e.g. the dgemv_t rule). It is empty unless HAVE_FMA3 is set
# and OLDGCC is not.
FMAFLAG =
ifdef HAVE_FMA3
ifndef OLDGCC
FMAFLAG = -mfma
endif
endif
|
||||
|
||||
### GEMV ###
|
||||
|
||||
ifndef SGEMVNKERNEL
|
||||
|
|
@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER
|
|||
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@
|
||||
|
||||
# Transposed double-precision real GEMV kernel (-DDOUBLE -UCOMPLEX -DTRANS).
# The object is compiled exactly once, with $(FMAFLAG) added; a second
# compile of the same target without that flag would only be overwritten.
$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
	$(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)
|
||||
|
|
|
|||
|
|
@ -447,6 +447,72 @@ XBLASOBJS += \
|
|||
|
||||
endif
|
||||
|
||||
###### BLAS small matrix optimization #####
|
||||
ifeq ($(SMALL_MATRIX_OPT), 1)

# Every precision contributes one "permit" object followed by its kernel
# objects: first the plain kernels, then the "b0" kernels, each in the
# variant order listed in the inner foreach.
# Real precisions have the variants nn nt tn tt; complex precisions have
# all sixteen two-letter combinations of n, t, r and c.

ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
  $(foreach smk_pfx,kernel kernel_b0,$(foreach smk_op,nn nt tn tt,sbgemm_small_$(smk_pfx)_$(smk_op)$(TSUFFIX).$(SUFFIX)))
endif

SBLASOBJS += sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
  $(foreach smk_pfx,kernel kernel_b0,$(foreach smk_op,nn nt tn tt,sgemm_small_$(smk_pfx)_$(smk_op)$(TSUFFIX).$(SUFFIX)))

DBLASOBJS += dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
  $(foreach smk_pfx,kernel kernel_b0,$(foreach smk_op,nn nt tn tt,dgemm_small_$(smk_pfx)_$(smk_op)$(TSUFFIX).$(SUFFIX)))

CBLASOBJS += cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
  $(foreach smk_pfx,kernel kernel_b0,$(foreach smk_op,nn nt nr nc tn tt tr tc rn rt rr rc cn ct cr cc,cgemm_small_$(smk_pfx)_$(smk_op)$(TSUFFIX).$(SUFFIX)))

ZBLASOBJS += zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
  $(foreach smk_pfx,kernel kernel_b0,$(foreach smk_op,nn nt nr nc tn tt tr tc rn rt rr rc cn ct cr cc,zgemm_small_$(smk_pfx)_$(smk_op)$(TSUFFIX).$(SUFFIX)))

endif
|
||||
|
||||
###### BLAS extensions #####
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
|
@ -4237,3 +4303,469 @@ endif
|
|||
$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
|
||||
|
||||
|
||||
|
||||
###### BLAS small matrix optimization #####
|
||||
|
||||
# --- Double-precision real small-matrix GEMM ---------------------------
# Source selection: any variable that is unset or empty at this point
# falls back to the generic C implementation.
ifndef DGEMM_SMALL_M_PERMIT
DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif
ifndef DGEMM_SMALL_K_NN
DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef DGEMM_SMALL_K_NT
DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef DGEMM_SMALL_K_TN
DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef DGEMM_SMALL_K_TT
DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

# "b0" kernels: same generic defaults, compiled with -DB0 further down.
ifndef DGEMM_SMALL_K_B0_NN
DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef DGEMM_SMALL_K_B0_NT
DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef DGEMM_SMALL_K_B0_TN
DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef DGEMM_SMALL_K_B0_TT
DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

# Permit object.
$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@

# Plain kernels.
$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@

# "b0" kernels (-DB0).
$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
|
||||
|
||||
# --- Single-precision real small-matrix GEMM ---------------------------
# Source selection: any variable that is unset or empty at this point
# falls back to the generic C implementation.
ifndef SGEMM_SMALL_M_PERMIT
SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif
ifndef SGEMM_SMALL_K_NN
SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SGEMM_SMALL_K_NT
SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SGEMM_SMALL_K_TN
SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SGEMM_SMALL_K_TT
SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

# "b0" kernels: same generic defaults, compiled with -DB0 further down.
ifndef SGEMM_SMALL_K_B0_NN
SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SGEMM_SMALL_K_B0_NT
SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SGEMM_SMALL_K_B0_TN
SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SGEMM_SMALL_K_B0_TT
SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

# Permit object.
$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

# Plain kernels.
$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

# "b0" kernels (-DB0).
$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
|
||||
|
||||
|
||||
# --- bfloat16 small-matrix GEMM (only when BUILD_BFLOAT16 is 1) ---------
ifeq ($(BUILD_BFLOAT16), 1)

# Source selection: any variable that is unset or empty at this point
# falls back to the generic C implementation.
ifndef SBGEMM_SMALL_M_PERMIT
SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif
ifndef SBGEMM_SMALL_K_NN
SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SBGEMM_SMALL_K_NT
SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SBGEMM_SMALL_K_TN
SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SBGEMM_SMALL_K_TT
SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

# "b0" kernels: same generic defaults, compiled with -DB0 further down.
ifndef SBGEMM_SMALL_K_B0_NN
SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SBGEMM_SMALL_K_B0_NT
SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SBGEMM_SMALL_K_B0_TN
SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SBGEMM_SMALL_K_B0_TT
SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif

# Permit object.
$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

# Plain kernels.
$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@

# "b0" kernels (-DB0).
$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@

endif
|
||||
|
||||
# --- Single-precision complex small-matrix GEMM ------------------------
# Source selection: any variable that is unset or empty at this point
# falls back to the generic complex ("zgemm_*") C implementation.
ifndef CGEMM_SMALL_M_PERMIT
CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
endif
ifndef CGEMM_SMALL_K_NN
CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef CGEMM_SMALL_K_NT
CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef CGEMM_SMALL_K_TN
CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef CGEMM_SMALL_K_TT
CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif

# "b0" kernels: same generic defaults, compiled with -DB0 further down.
ifndef CGEMM_SMALL_K_B0_NN
CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef CGEMM_SMALL_K_B0_NT
CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef CGEMM_SMALL_K_B0_TN
CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef CGEMM_SMALL_K_B0_TT
CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif

# Permit object.
$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@

# Each of the four sources is compiled four times; the two-letter -D flag
# selects the variant.
# NOTE(review): RC and CR are passed as -DRC=RC / -DCR=CR rather than as a
# plain -D like the other variants; presumably deliberate -- confirm before
# normalising.

# Built from $(CGEMM_SMALL_K_NN): nn nr rn rr
$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@

# Built from $(CGEMM_SMALL_K_NT): nt nc rt rc
$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@

# Built from $(CGEMM_SMALL_K_TN): tn tr cn cr
$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@

# Built from $(CGEMM_SMALL_K_TT): tt tc ct cc
$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@

# "b0" kernels built from $(CGEMM_SMALL_K_B0_NN): nn nr rn rr
$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@

# "b0" kernels built from $(CGEMM_SMALL_K_B0_NT): nt nc rt rc
$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@

# "b0" kernels built from $(CGEMM_SMALL_K_B0_TN): tn tr cn cr
$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@

# "b0" kernels built from $(CGEMM_SMALL_K_B0_TT): tt tc ct cc
$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@
|
||||
|
||||
# --- Double-precision complex small-matrix GEMM ------------------------
# Source selection: any variable that is unset or empty at this point
# falls back to the generic complex C implementation.
ifndef ZGEMM_SMALL_M_PERMIT
ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
endif
ifndef ZGEMM_SMALL_K_NN
ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef ZGEMM_SMALL_K_NT
ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef ZGEMM_SMALL_K_TN
ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef ZGEMM_SMALL_K_TT
ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif

# "b0" kernels: same generic defaults, compiled with -DB0 further down.
ifndef ZGEMM_SMALL_K_B0_NN
ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef ZGEMM_SMALL_K_B0_NT
ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef ZGEMM_SMALL_K_B0_TN
ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef ZGEMM_SMALL_K_B0_TT
ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif

# Permit object.
$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@

# Each of the four sources is compiled four times; the two-letter -D flag
# selects the variant.
# NOTE(review): RC and CR are passed as -DRC=RC / -DCR=CR rather than as a
# plain -D like the other variants; presumably deliberate -- confirm before
# normalising.

# Built from $(ZGEMM_SMALL_K_NN): nn nr rn rr
$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@

# Built from $(ZGEMM_SMALL_K_NT): nt nc rt rc
$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@

# Built from $(ZGEMM_SMALL_K_TN): tn tr cn cr
$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@

# Built from $(ZGEMM_SMALL_K_TT): tt tc ct cc
$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@

# "b0" kernels built from $(ZGEMM_SMALL_K_B0_NN): nn nr rn rr
$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@

# "b0" kernels built from $(ZGEMM_SMALL_K_B0_NT): nt nc rt rc
$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@

# "b0" kernels built from $(ZGEMM_SMALL_K_B0_TN): tn tr cn cr
$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@

# "b0" kernels built from $(ZGEMM_SMALL_K_B0_TT): tt tc ct cc
$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@
|
||||
|
|
|
|||
|
|
@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define B03 x16
|
||||
#define B04 x17
|
||||
|
||||
// Loop counters and scratch register. Each alias is defined exactly once;
// the earlier x18-based assignment is dropped (x18 is presumably avoided
// because AAPCS64 designates it the platform register -- confirm).
// NOTE(review): the former TEMP2 alias shared x21 with TEMP1 and is not
// kept; verify nothing in this file still refers to TEMP2.
#define I x19
#define J x20

#define TEMP1 x21
|
||||
|
||||
#define A_PREFETCH 2560
|
||||
#define B_PREFETCH 256
|
||||
|
|
|
|||
|
|
@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alpha x17
// temp is bound to x21 only; x18 is left unused (presumably because
// AAPCS64 designates it the platform register -- confirm).
#define tempOffset x19
#define tempK x20
#define temp x21
|
||||
|
||||
#define alpha0 d10
|
||||
#define alphaV0 v10.d[0]
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ All rights reserved.
|
|||
#define B00 x22
|
||||
|
||||
|
||||
// I is bound to x21 only; x18 is left unused (presumably because AAPCS64
// designates it the platform register -- confirm).
#define I x21
|
||||
#define J x19
|
||||
|
||||
#define TEMP1 x20
|
||||
|
|
|
|||
|
|
@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alpha w17
// temp is bound to x21 only; x18 is left unused (presumably because
// AAPCS64 designates it the platform register -- confirm).
#define tempOffset x19
#define tempK x20
#define temp x21
|
||||
|
||||
#define alpha0 s10
|
||||
#define alphaV0 v10.s[0]
|
||||
|
|
|
|||
|
|
@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow2 x14
|
||||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
// alphaR/alphaI are each defined once, on x19/x20; the earlier x17/x18
// pair is dropped so the macros are not redefined and x18 stays unused
// (presumably because AAPCS64 designates it the platform register -- confirm).
#define alphaR x19
#define alphaI x20
|
||||
|
||||
#define alpha0_R d10
|
||||
#define alphaV0_R v10.d[0]
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR x17
|
||||
// alphaI is bound to x22 only; x18 is left unused (presumably because
// AAPCS64 designates it the platform register -- confirm).
#define alphaI x22
|
||||
#define temp x19
|
||||
#define tempOffset x20
|
||||
#define tempK x21
|
||||
|
|
|
|||
|
|
@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
int n1 = n & -4;
|
||||
#if V_SIMD && !defined(DSDOT)
|
||||
const int vstep = v_nlanes_f32;
|
||||
const int unrollx4 = n & (-vstep * 4);
|
||||
|
|
@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
}
|
||||
dot = v_sum_f32(vsum0);
|
||||
#elif defined(DSDOT)
|
||||
int n1 = n & -4;
|
||||
for (; i < n1; i += 4)
|
||||
{
|
||||
dot += (double) y[i] * (double) x[i]
|
||||
|
|
@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
+ (double) y[i+3] * (double) x[i+3] ;
|
||||
}
|
||||
#else
|
||||
int n1 = n & -4;
|
||||
for (; i < n1; i += 4)
|
||||
{
|
||||
dot += y[i] * x[i]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,56 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
//naive implemtation
|
||||
//Column major
|
||||
|
||||
BLASLONG i,j,k;
|
||||
FLOAT result=0.0;
|
||||
|
||||
for(i=0; i<M; i++){
|
||||
for(j=0; j<N; j++){
|
||||
result=0.0;
|
||||
for(k=0; k<K; k++){
|
||||
result += A[i+k*lda] * B[k+j*ldb];
|
||||
}
|
||||
#ifdef B0
|
||||
C[i+j*ldc]=alpha * result;
|
||||
#else
|
||||
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
//naive implemtation
|
||||
//Column major
|
||||
|
||||
BLASLONG i,j,k;
|
||||
FLOAT result=0.0;
|
||||
|
||||
for(i=0; i<M; i++){
|
||||
for(j=0; j<N; j++){
|
||||
result=0.0;
|
||||
for(k=0; k<K; k++){
|
||||
result += A[i+k*lda] * B[k*ldb+j];
|
||||
}
|
||||
#ifdef B0
|
||||
C[i+j*ldc]=alpha * result;
|
||||
#else
|
||||
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
//naive implemtation
|
||||
//Column major
|
||||
|
||||
BLASLONG i,j,k;
|
||||
FLOAT result=0.0;
|
||||
|
||||
for(i=0; i<M; i++){
|
||||
for(j=0; j<N; j++){
|
||||
result=0.0;
|
||||
for(k=0; k<K; k++){
|
||||
result += A[i*lda+k] * B[k+j*ldb];
|
||||
}
|
||||
#ifdef B0
|
||||
C[i+j*ldc]=alpha * result;
|
||||
#else
|
||||
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,57 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
//naive implemtation
|
||||
//Column major
|
||||
|
||||
BLASLONG i,j,k;
|
||||
FLOAT result=0.0;
|
||||
|
||||
for(i=0; i<M; i++){
|
||||
for(j=0; j<N; j++){
|
||||
result=0.0;
|
||||
for(k=0; k<K; k++){
|
||||
result += A[i*lda+k] * B[k*ldb+j];
|
||||
}
|
||||
#ifdef B0
|
||||
C[i+j*ldc]=alpha * result;
|
||||
#else
|
||||
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unconditionally returns 0; none of the arguments is inspected.
 * NOTE(review): presumably a return value of 0 tells the caller not to use
 * the small-matrix code path -- confirm against the caller.
 * The block comment kept inside the body is a disabled heuristic that would
 * return 1 when M*N*K <= 100*100*100 and 0 otherwise.
 */
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
|
||||
{
|
||||
return 0;
|
||||
/*
|
||||
double MNK = (double) M * (double) N * (double) K;
|
||||
if (MNK <= 100.0*100.0*100.0)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
*/
|
||||
}
|
||||
|
|
@ -0,0 +1,89 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
FLOAT real, imag;
|
||||
#ifndef B0
|
||||
FLOAT tmp0, tmp1;
|
||||
#endif
|
||||
int i, j, l;
|
||||
for(i = 0; i < M; i++){
|
||||
for(j = 0; j < N; j++){
|
||||
real=0;
|
||||
imag=0;
|
||||
|
||||
for(l = 0; l < K; l++){
|
||||
#if defined(NN)
|
||||
real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l]
|
||||
-A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1]
|
||||
+ A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]);
|
||||
#elif defined(NR)
|
||||
real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l]
|
||||
+A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1]
|
||||
+ A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]);
|
||||
#elif defined(RN)
|
||||
real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l]
|
||||
+A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1]
|
||||
- A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]);
|
||||
#elif defined(RR)
|
||||
real += (A[l*2*lda + 2*i]*B[j*2*ldb + 2*l]
|
||||
-A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(-A[l*2*lda + 2*i] * B[j*2*ldb + 2*l + 1]
|
||||
- A[l*2*lda + 2*i + 1] * B[j*2*ldb + 2*l]);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef B0
|
||||
tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1];
|
||||
tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i];
|
||||
|
||||
|
||||
C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1;
|
||||
#else
|
||||
C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
FLOAT real, imag;
|
||||
#ifndef B0
|
||||
FLOAT tmp0, tmp1;
|
||||
#endif
|
||||
int i, j, l;
|
||||
for(i = 0; i < M; i++){
|
||||
for(j = 0; j < N; j++){
|
||||
real=0;
|
||||
imag=0;
|
||||
|
||||
for(l = 0; l < K; l++){
|
||||
#if defined(NT)
|
||||
real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j]
|
||||
-A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1]
|
||||
+ A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#elif defined(NC)
|
||||
real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j]
|
||||
+A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1]
|
||||
+ A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#elif defined(RT)
|
||||
real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j]
|
||||
+A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1]
|
||||
- A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#elif defined(RC)
|
||||
real += (A[l*2*lda + 2*i]*B[l*2*ldb + 2*j]
|
||||
-A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(-A[l*2*lda + 2*i] * B[l*2*ldb + 2*j + 1]
|
||||
- A[l*2*lda + 2*i + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef B0
|
||||
tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1];
|
||||
tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i];
|
||||
|
||||
|
||||
C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1;
|
||||
#else
|
||||
C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
FLOAT real, imag;
|
||||
#ifndef B0
|
||||
FLOAT tmp0, tmp1;
|
||||
#endif
|
||||
int i, j, l;
|
||||
for(i = 0; i < M; i++){
|
||||
for(j = 0; j < N; j++){
|
||||
real=0;
|
||||
imag=0;
|
||||
|
||||
for(l = 0; l < K; l++){
|
||||
#if defined(TN)
|
||||
real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l]
|
||||
-A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1]
|
||||
+ A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]);
|
||||
|
||||
#elif defined(TR)
|
||||
real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l]
|
||||
+A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1]
|
||||
+ A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]);
|
||||
|
||||
#elif defined(CN)
|
||||
real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l]
|
||||
+A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1]
|
||||
- A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]);
|
||||
|
||||
#elif defined(CR)
|
||||
real += (A[i*2*lda + 2*l]*B[j*2*ldb + 2*l]
|
||||
-A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l + 1]);
|
||||
|
||||
imag+=(-A[i*2*lda + 2*l] * B[j*2*ldb + 2*l + 1]
|
||||
- A[i*2*lda + 2*l + 1] * B[j*2*ldb + 2*l]);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef B0
|
||||
tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1];
|
||||
tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i];
|
||||
|
||||
|
||||
C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1;
|
||||
#else
|
||||
C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,93 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef B0
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
|
||||
#else
|
||||
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
FLOAT real, imag;
|
||||
#ifndef B0
|
||||
FLOAT tmp0, tmp1;
|
||||
#endif
|
||||
int i, j, l;
|
||||
for(i = 0; i < M; i++){
|
||||
for(j = 0; j < N; j++){
|
||||
real=0;
|
||||
imag=0;
|
||||
|
||||
for(l = 0; l < K; l++){
|
||||
#if defined(TT)
|
||||
real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j]
|
||||
-A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1]
|
||||
+ A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#elif defined(TC)
|
||||
real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j]
|
||||
+A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1]
|
||||
+ A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#elif defined(CT)
|
||||
real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j]
|
||||
+A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1]
|
||||
- A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#elif defined(CC)
|
||||
real += (A[i*2*lda + 2*l]*B[l*2*ldb + 2*j]
|
||||
-A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j + 1]);
|
||||
|
||||
imag+=(-A[i*2*lda + 2*l] * B[l*2*ldb + 2*j + 1]
|
||||
- A[i*2*lda + 2*l + 1] * B[l*2*ldb + 2*j]);
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifndef B0
|
||||
tmp0 = beta0*C[j*2*ldc + 2*i] - beta1*C[j*2*ldc+ 2*i + 1];
|
||||
tmp1 = beta0*C[j*2*ldc+ 2*i + 1] + beta1*C[j*2*ldc + 2*i];
|
||||
|
||||
|
||||
C[j*2*ldc + 2*i] =tmp0+ alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = tmp1+ alpha0*imag + real*alpha1;
|
||||
#else
|
||||
C[j*2*ldc + 2*i] = alpha0*real - alpha1*imag;
|
||||
C[j*2*ldc+ 2*i + 1] = alpha0*imag + real*alpha1;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Unconditionally returns 0; none of the arguments (transpose flags, sizes,
 * the two alpha components, the two beta components) is inspected.
 * NOTE(review): presumably a return value of 0 tells the caller not to use
 * the small-matrix code path -- confirm against the caller.
 * The block comment kept inside the body is a disabled heuristic that would
 * return 1 when M*N*K <= 100*100*100 and 0 otherwise.
 */
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1)
|
||||
{
|
||||
return 0;
|
||||
/*
|
||||
double MNK = (double) M * (double) N * (double) K;
|
||||
if (MNK <= 100.0*100.0*100.0)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
*/
|
||||
}
|
||||
|
|
@ -0,0 +1,238 @@
|
|||
# AXPY kernel defaults. Each variable is assigned only when it does not
# already hold a non-empty value (ifndef), so an earlier setting wins.
# The S and D variables share ../arm/axpy.c; the C and Z variables share
# ../arm/zaxpy.c.
ifndef SAXPYKERNEL
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
endif
|
||||
|
||||
ifndef DAXPYKERNEL
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
endif
|
||||
|
||||
ifndef CAXPYKERNEL
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
endif
|
||||
|
||||
ifndef ZAXPYKERNEL
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
endif
|
||||
|
||||
# ROT kernel defaults, applied only when the variable has no value yet.
# The S and D variables share ../arm/rot.c; the C and Z variables share
# ../arm/zrot.c.
ifndef SROTKERNEL
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
endif
|
||||
|
||||
ifndef DROTKERNEL
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
endif
|
||||
|
||||
ifndef CROTKERNEL
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
ifndef ZROTKERNEL
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
# SWAP kernel defaults for the C and Z variables only (both use
# ../arm/zswap.c); no S/D SWAP default is set in this part of the file.
ifndef CSWAPKERNEL
|
||||
CSWAPKERNEL = ../arm/zswap.c
|
||||
endif
|
||||
|
||||
ifndef ZSWAPKERNEL
|
||||
ZSWAPKERNEL = ../arm/zswap.c
|
||||
endif
|
||||
|
||||
# SUM kernel defaults, applied only when the variable has no value yet.
# The S and D variables share ../arm/sum.c; the C and Z variables share
# ../arm/zsum.c.
ifndef SSUMKERNEL
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
endif
|
||||
|
||||
ifndef DSUMKERNEL
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
endif
|
||||
|
||||
ifndef CSUMKERNEL
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
endif
|
||||
|
||||
ifndef ZSUMKERNEL
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
endif
|
||||
|
||||
# IxMAX / IxMIN kernel defaults for the S and D variables, applied only when
# the variable has no value yet. Both MAX variables use ../arm/imax.c and
# both MIN variables use ../arm/imin.c.
ifndef ISMAXKERNEL
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
endif
|
||||
|
||||
ifndef IDMAXKERNEL
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
endif
|
||||
|
||||
ifndef ISMINKERNEL
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
endif
|
||||
|
||||
ifndef IDMINKERNEL
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
endif
|
||||
|
||||
# NRM2 kernel defaults, applied only when the variable has no value yet.
# Unlike the ../arm C fallbacks above, each of the four variables names its
# own .S file given without a directory prefix.
ifndef SNRM2KERNEL
|
||||
SNRM2KERNEL = snrm2.S
|
||||
endif
|
||||
|
||||
ifndef DNRM2KERNEL
|
||||
DNRM2KERNEL = dnrm2.S
|
||||
endif
|
||||
|
||||
ifndef CNRM2KERNEL
|
||||
CNRM2KERNEL = cnrm2.S
|
||||
endif
|
||||
|
||||
ifndef ZNRM2KERNEL
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
# CABS and LSAME helper defaults, applied only when the variable has no
# value yet. The S, D and Q CABS variables all use ../generic/cabs.c;
# LSAME uses ../generic/lsame.c.
ifndef SCABS_KERNEL
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef DCABS_KERNEL
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef QCABS_KERNEL
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef LSAME_KERNEL
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
endif
|
||||
|
||||
# SGEMM / DGEMM defaults. Each ifndef tests only xGEMMKERNEL: when the kernel
# has no value yet, the kernel source, the four copy-routine sources
# (INCOPY/ITCOPY use the *_2.c generic files, ONCOPY/OTCOPY the *_8.c ones)
# and the four copy object names are all set together. If the kernel is
# already set, none of the copy variables is touched here.
ifndef SGEMMKERNEL
|
||||
SGEMMKERNEL = gemm_kernel.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
endif
|
||||
|
||||
ifndef DGEMMKERNEL
|
||||
DGEMMKERNEL = gemm_kernel.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
endif
|
||||
|
||||
# CGEMM / ZGEMM defaults. Each ifndef tests only xGEMMKERNEL: when the kernel
# has no value yet, the kernel source (zgemm_kernel.S for both), the four
# copy-routine sources (INCOPY/ITCOPY use the zgemm *_1.c generic files,
# ONCOPY/OTCOPY the *_4.c ones) and the four copy object names are all set
# together. If the kernel is already set, none of the copy variables is
# touched here.
ifndef CGEMMKERNEL
|
||||
CGEMMKERNEL = zgemm_kernel.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
endif
|
||||
|
||||
ifndef ZGEMMKERNEL
|
||||
ZGEMMKERNEL = zgemm_kernel.S
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
endif
|
||||
|
||||
# GEMM_BETA defaults, applied only when the variable has no value yet.
# The S and D variables share ../generic/gemm_beta.c; the C and Z variables
# share ../generic/zgemm_beta.c.
ifndef SGEMM_BETA
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef DGEMM_BETA
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef CGEMM_BETA
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
|
||||
# STRSM / DTRSM kernel defaults for the LN, LT, RN and RT variants, applied
# only when the variable has no value yet. S and D use the same .S files.
# NOTE(review): the _RN variables point at trsm_kernel_LT.S rather than a
# dedicated RN source -- presumably intentional reuse; confirm.
ifndef STRSMKERNEL_LN
|
||||
STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
endif
|
||||
|
||||
ifndef STRSMKERNEL_LT
|
||||
STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef STRSMKERNEL_RN
|
||||
STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef STRSMKERNEL_RT
|
||||
STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_LN
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_LT
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_RN
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_RT
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
# CTRSM / ZTRSM kernel defaults for the LN, LT, RN and RT variants, applied
# only when the variable has no value yet. C and Z use the same .S files.
# NOTE(review): the _LN, _LT and _RN variables all point at ztrsm_kernel_LT.S
# and only _RT has its own source (ztrsm_kernel_RT.S) -- presumably
# intentional reuse; confirm.
ifndef CTRSMKERNEL_LN
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LT
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RN
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RT
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LN
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LT
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RN
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RT
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
# GEMM3M kernel defaults, applied only when the variable has no value yet.
# Both the C and Z variables use zgemm3m_kernel.S.
ifndef CGEMM3MKERNEL
|
||||
CGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
endif
|
||||
|
||||
ifndef ZGEMM3MKERNEL
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
endif
|
||||
|
||||
# NOTE(review): unlike the ifndef-guarded defaults earlier in this file, this
# assignment is unconditional, so it replaces any value DSDOTKERNEL already
# had when this file is read -- confirm that the override is intended.
DSDOTKERNEL = dot.S
|
||||
|
|
@ -0,0 +1 @@
|
|||
#TODO: Add loongarch64 SIMD optimizations
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
# Kernel source selection table: every entry points at a C implementation
# under ../generic or ../arm (the *COPYOBJ entries name object files).
# All assignments are unconditional.

# --- GEMM beta scaling --------------------------------------------------------
SGEMM_BETA     = ../generic/gemm_beta.c
DGEMM_BETA     = ../generic/gemm_beta.c
CGEMM_BETA     = ../generic/zgemm_beta.c
ZGEMM_BETA     = ../generic/zgemm_beta.c

# --- TRMM kernels (2x2) -------------------------------------------------------
STRMMKERNEL    = ../generic/trmmkernel_2x2.c
DTRMMKERNEL    = ../generic/trmmkernel_2x2.c
CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c

# --- GEMM kernels (2x2) with their copy routines and copy object names --------
SGEMMKERNEL    = ../generic/gemmkernel_2x2.c
SGEMMONCOPY    = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

DGEMMKERNEL    = ../generic/gemmkernel_2x2.c
DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

# --- TRSM kernels: the same four generic sources for S, D, C and Z -------------
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

# --- Remaining kernels: plain C -----------------------------------------------

# absolute max / min
SAMAXKERNEL    = ../arm/amax.c
DAMAXKERNEL    = ../arm/amax.c
CAMAXKERNEL    = ../arm/zamax.c
ZAMAXKERNEL    = ../arm/zamax.c

SAMINKERNEL    = ../arm/amin.c
DAMINKERNEL    = ../arm/amin.c
CAMINKERNEL    = ../arm/zamin.c
ZAMINKERNEL    = ../arm/zamin.c

# max / min
SMAXKERNEL     = ../arm/max.c
DMAXKERNEL     = ../arm/max.c

SMINKERNEL     = ../arm/min.c
DMINKERNEL     = ../arm/min.c

# index of absolute max / min
ISAMAXKERNEL   = ../arm/iamax.c
IDAMAXKERNEL   = ../arm/iamax.c
ICAMAXKERNEL   = ../arm/izamax.c
IZAMAXKERNEL   = ../arm/izamax.c

ISAMINKERNEL   = ../arm/iamin.c
IDAMINKERNEL   = ../arm/iamin.c
ICAMINKERNEL   = ../arm/izamin.c
IZAMINKERNEL   = ../arm/izamin.c

# index of max / min
ISMAXKERNEL    = ../arm/imax.c
IDMAXKERNEL    = ../arm/imax.c

ISMINKERNEL    = ../arm/imin.c
IDMINKERNEL    = ../arm/imin.c

# asum / sum
SASUMKERNEL    = ../arm/asum.c
DASUMKERNEL    = ../arm/asum.c
CASUMKERNEL    = ../arm/zasum.c
ZASUMKERNEL    = ../arm/zasum.c

SSUMKERNEL     = ../arm/sum.c
DSUMKERNEL     = ../arm/sum.c
CSUMKERNEL     = ../arm/zsum.c
ZSUMKERNEL     = ../arm/zsum.c

# axpy / copy
SAXPYKERNEL    = ../arm/axpy.c
DAXPYKERNEL    = ../arm/axpy.c
CAXPYKERNEL    = ../arm/zaxpy.c
ZAXPYKERNEL    = ../arm/zaxpy.c

SCOPYKERNEL    = ../arm/copy.c
DCOPYKERNEL    = ../arm/copy.c
CCOPYKERNEL    = ../arm/zcopy.c
ZCOPYKERNEL    = ../arm/zcopy.c

# dot
# NOTE(review): SDOT alone is taken from ../generic while D/C/Z come from
# ../arm. Kept exactly as found; confirm this is intentional.
SDOTKERNEL     = ../generic/dot.c
DDOTKERNEL     = ../arm/dot.c
CDOTKERNEL     = ../arm/zdot.c
ZDOTKERNEL     = ../arm/zdot.c

# nrm2 / rot / scal / swap
SNRM2KERNEL    = ../arm/nrm2.c
DNRM2KERNEL    = ../arm/nrm2.c
CNRM2KERNEL    = ../arm/znrm2.c
ZNRM2KERNEL    = ../arm/znrm2.c

SROTKERNEL     = ../arm/rot.c
DROTKERNEL     = ../arm/rot.c
CROTKERNEL     = ../arm/zrot.c
ZROTKERNEL     = ../arm/zrot.c

SSCALKERNEL    = ../arm/scal.c
DSCALKERNEL    = ../arm/scal.c
CSCALKERNEL    = ../arm/zscal.c
ZSCALKERNEL    = ../arm/zscal.c

SSWAPKERNEL    = ../arm/swap.c
DSWAPKERNEL    = ../arm/swap.c
CSWAPKERNEL    = ../arm/zswap.c
ZSWAPKERNEL    = ../arm/zswap.c

# gemv (N and T variants)
SGEMVNKERNEL   = ../arm/gemv_n.c
DGEMVNKERNEL   = ../arm/gemv_n.c
CGEMVNKERNEL   = ../arm/zgemv_n.c
ZGEMVNKERNEL   = ../arm/zgemv_n.c

SGEMVTKERNEL   = ../arm/gemv_t.c
DGEMVTKERNEL   = ../arm/gemv_t.c
CGEMVTKERNEL   = ../arm/zgemv_t.c
ZGEMVTKERNEL   = ../arm/zgemv_t.c

# symv / hemv (upper and lower)
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c

ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c

# lsame / cabs helpers
LSAME_KERNEL   = ../generic/lsame.c
SCABS_KERNEL   = ../generic/cabs.c
DCABS_KERNEL   = ../generic/cabs.c
QCABS_KERNEL   = ../generic/cabs.c

# --- GEMM3M: the generic "dump" sources ----------------------------------------
CGEMM3MKERNEL  = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL  = ../generic/zgemm3mkernel_dump.c
|
||||
|
|
@ -0,0 +1 @@
|
|||
# Double-colon `clean` rule with no prerequisites and no recipe: this file
# contributes nothing of its own to the `clean` goal.
clean ::
|
||||
|
|
@ -0,0 +1,230 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
 * Absolute-maximum reduction over a strided vector.
 *
 *   N    ($r4)  element count
 *   X    ($r5)  address of the first element
 *   INCX ($r6)  stride; shifted left by BASE_SHIFT before use (presumably
 *               converting an element stride to bytes -- TODO confirm)
 *
 * The result is kept in s1 ($f22) and copied to $f0 before returning.
 *
 * LD, FABS, CMPLT, CMOVT, MTC, LDINT, SIZE and BASE_SHIFT are macros from
 * common.h, which is not visible here.  The comments below assume that CMPLT
 * sets the condition flag when its first source is less than its second and
 * that CMOVT selects its third operand when the flag is set -- TODO confirm.
 */
#define ASSEMBLER
#include "common.h"

/* integer registers */
#define N	$r4
#define X	$r5
#define INCX	$r6

#define I	$r17
#define TEMP	$r18

/* eight loaded input values */
#define a1	$f10
#define a2	$f11
#define a3	$f12
#define a4	$f13
#define a5	$f14
#define a6	$f15
#define a7	$f16
#define a8	$f17

/* FABS results waiting to be compared */
#define t1	$f0
#define t2	$f1
#define t3	$f2
#define t4	$f3

/* four independent running candidates, merged at .L998 */
#define s1	$f22
#define s2	$f8
#define s3	$f23
#define s4	$f9

	PROLOGUE

#ifdef F_INTERFACE
	/* NOTE(review): these operands use an offset(base) form, unlike the LD
	   lines below -- confirm LDINT accepts it. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* s1 <- $r0 (presumably 0.0): the value .L999 returns on the early exits */
	MTC	s1, $r0
	bge	$r0, N, .L999			/* N <= 0 */

	slli.d	INCX, INCX, BASE_SHIFT
	bge	$r0, INCX, .L999		/* INCX <= 0 */

	/* Seed all four candidates with FABS of the first element. */
	LD	a1, X, 0 * SIZE
	addi.d	N, N, -1

	add.d	X, X, INCX
	FABS	s1, a1

	FABS	s2, a1
	bge	$r0, N, .L999			/* only one element */

	FABS	s3, a1
	srai.d	I, N, 3				/* I = remaining / 8 */

	FABS	s4, a1
	bge	$r0, I, .L15			/* fewer than 8 remain */

	/* Preload the first group of eight. */
	LD	a1, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a2, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a3, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a4, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a5, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a6, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a7, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a8, X, 0 * SIZE
	addi.d	I, I, -1

	add.d	X, X, INCX
	bge	$r0, I, .L13			/* no further group to preload */
	.align 3

/* Main loop: reduce the loaded group while loading the next one. */
.L12:
	FABS	t1, a1
	LD	a1, X, 0 * SIZE
	FABS	t2, a2
	add.d	X, X, INCX

	FABS	t3, a3
	LD	a2, X, 0 * SIZE
	FABS	t4, a4
	add.d	X, X, INCX

	CMPLT	$fcc0, s1, t1
	LD	a3, X, 0 * SIZE
	CMPLT	$fcc1, s2, t2
	add.d	X, X, INCX

	CMPLT	$fcc2, s3, t3
	LD	a4, X, 0 * SIZE
	CMPLT	$fcc3, s4, t4
	add.d	X, X, INCX

	CMOVT	s1, s1, t1, $fcc0
	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2
	CMOVT	s4, s4, t4, $fcc3

	FABS	t1, a5
	LD	a5, X, 0 * SIZE
	FABS	t2, a6
	add.d	X, X, INCX

	FABS	t3, a7
	LD	a6, X, 0 * SIZE
	FABS	t4, a8
	add.d	X, X, INCX

	CMPLT	$fcc0, s1, t1
	LD	a7, X, 0 * SIZE
	CMPLT	$fcc1, s2, t2
	add.d	X, X, INCX

	CMPLT	$fcc2, s3, t3
	LD	a8, X, 0 * SIZE
	CMPLT	$fcc3, s4, t4
	add.d	X, X, INCX

	CMOVT	s1, s1, t1, $fcc0
	addi.d	I, I, -1

	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2

	CMOVT	s4, s4, t4, $fcc3
	blt	$r0, I, .L12
	.align 3

/* Drain: reduce the last loaded group without issuing further loads. */
.L13:
	FABS	t1, a1
	FABS	t2, a2
	FABS	t3, a3
	FABS	t4, a4

	CMPLT	$fcc0, s1, t1
	CMPLT	$fcc1, s2, t2
	CMPLT	$fcc2, s3, t3
	CMPLT	$fcc3, s4, t4

	CMOVT	s1, s1, t1, $fcc0
	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2
	CMOVT	s4, s4, t4, $fcc3

	FABS	t1, a5
	FABS	t2, a6
	FABS	t3, a7
	FABS	t4, a8

	CMPLT	$fcc0, s1, t1
	CMPLT	$fcc1, s2, t2
	CMPLT	$fcc2, s3, t3
	CMPLT	$fcc3, s4, t4

	CMOVT	s1, s1, t1, $fcc0
	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2
	CMOVT	s4, s4, t4, $fcc3
	.align 3

/* Remainder: the low three bits of N, one element per pass, into s1 only. */
.L15:
	andi	I, N, 7

	bge	$r0, I, .L998
	.align 3

.L16:
	LD	a1, X, 0 * SIZE
	addi.d	I, I, -1

	FABS	t1, a1

	CMPLT	$fcc0, s1, t1

	CMOVT	s1, s1, t1, $fcc0

	add.d	X, X, INCX
	blt	$r0, I, .L16
	.align 3

/* Merge the four candidates pairwise into s1. */
.L998:
	CMPLT	$fcc0, s1, s2
	CMPLT	$fcc1, s3, s4

	CMOVT	s1, s1, s2, $fcc0
	CMOVT	s3, s3, s4, $fcc1

	CMPLT	$fcc0, s1, s3
	CMOVT	s1, s1, s3, $fcc0
	.align 3

/* Exit: result in $f0, jump to the address held in $r1. */
.L999:
	/* NOTE(review): copies I into $r4; the purpose is not evident from this file. */
	move	$r4, $r17
	fmov.d	$f0, $f22
	jirl	$r0, $r1, 0x0

	EPILOGUE
|
||||
|
|
@ -0,0 +1,186 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
 * Absolute-minimum reduction over a strided vector.
 *
 *   N    ($r4)  element count
 *   X    ($r5)  address of the first element
 *   INCX ($r6)  stride; shifted left by BASE_SHIFT before use (presumably
 *               converting an element stride to bytes -- TODO confirm)
 *
 * The result is kept in s1 ($f22) and copied to $f0 before returning.
 *
 * LD, FABS, CMPLT, CMOVT, MTC, LDINT, NOP, SIZE and BASE_SHIFT are macros
 * from common.h, which is not visible here.  The comments below assume that
 * CMPLT sets the condition flag when its first source is less than its second
 * and that CMOVT selects its third operand when the flag is set -- TODO confirm.
 */
#define ASSEMBLER
#include "common.h"

/* integer registers */
#define N	$r4
#define X	$r5
#define INCX	$r6
#define I	$r17
#define TEMP	$r18

/* eight loaded input values */
#define a1	$f10
#define a2	$f11
#define a3	$f12
#define a4	$f13
#define a5	$f14
#define a6	$f15
#define a7	$f16
#define a8	$f17

/* FABS results waiting to be compared */
#define t1	$f0
#define t2	$f1
#define t3	$f2
#define t4	$f3

/* four independent running candidates, merged at .L998 */
#define s1	$f22
#define s2	$f8
#define s3	$f23
#define s4	$f9

	PROLOGUE

#ifdef F_INTERFACE
	/* NOTE(review): these operands use an offset(base) form, unlike the LD
	   lines below -- confirm LDINT accepts it. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	/* s1 <- $r0 (presumably 0.0): the value .L999 returns on the early exits */
	MTC	s1, $r0
	bge	$r0, N, .L999			/* N <= 0 */
	slli.d	INCX, INCX, BASE_SHIFT
	bge	$r0, INCX, .L999		/* INCX <= 0 */

	/* Seed all four candidates with FABS of the first element. */
	LD	a1, X, 0 * SIZE
	addi.d	N, N, -1
	add.d	X, X, INCX
	FABS	s1, a1
	FABS	s2, a1
	bge	$r0, N, .L999			/* only one element */
	FABS	s3, a1
	srai.d	I, N, 3				/* I = remaining / 8 */
	FABS	s4, a1
	bge	$r0, I, .L15			/* fewer than 8 remain */

	/* Preload the first group of eight. */
	LD	a1, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a2, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a3, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a4, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a5, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a6, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a7, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a8, X, 0 * SIZE
	addi.d	I, I, -1
	add.d	X, X, INCX
	bge	$r0, I, .L13			/* no further group to preload */
	.align 3

/* Main loop: reduce the loaded group while loading the next one. */
.L12:
	FABS	t1, a1
	LD	a1, X, 0 * SIZE
	FABS	t2, a2
	add.d	X, X, INCX
	FABS	t3, a3
	LD	a2, X, 0 * SIZE
	FABS	t4, a4
	add.d	X, X, INCX
	CMPLT	$fcc0, t1, s1
	LD	a3, X, 0 * SIZE
	CMPLT	$fcc1, t2, s2
	add.d	X, X, INCX
	CMPLT	$fcc2, t3, s3
	LD	a4, X, 0 * SIZE
	CMPLT	$fcc3, t4, s4
	add.d	X, X, INCX
	CMOVT	s1, s1, t1, $fcc0
	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2
	CMOVT	s4, s4, t4, $fcc3
	FABS	t1, a5
	LD	a5, X, 0 * SIZE
	FABS	t2, a6
	add.d	X, X, INCX
	FABS	t3, a7
	LD	a6, X, 0 * SIZE
	FABS	t4, a8
	add.d	X, X, INCX
	CMPLT	$fcc0, t1, s1
	LD	a7, X, 0 * SIZE
	CMPLT	$fcc1, t2, s2
	add.d	X, X, INCX
	CMPLT	$fcc2, t3, s3
	LD	a8, X, 0 * SIZE
	CMPLT	$fcc3, t4, s4
	add.d	X, X, INCX
	CMOVT	s1, s1, t1, $fcc0
	addi.d	I, I, -1
	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2
	CMOVT	s4, s4, t4, $fcc3
	blt	$r0, I, .L12
	.align 3

/* Drain: reduce the last loaded group without issuing further loads. */
.L13:
	FABS	t1, a1
	FABS	t2, a2
	FABS	t3, a3
	FABS	t4, a4
	CMPLT	$fcc0, t1, s1
	CMPLT	$fcc1, t2, s2
	CMPLT	$fcc2, t3, s3
	CMPLT	$fcc3, t4, s4
	CMOVT	s1, s1, t1, $fcc0
	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2
	CMOVT	s4, s4, t4, $fcc3
	FABS	t1, a5
	FABS	t2, a6
	FABS	t3, a7
	FABS	t4, a8
	CMPLT	$fcc0, t1, s1
	CMPLT	$fcc1, t2, s2
	CMPLT	$fcc2, t3, s3
	CMPLT	$fcc3, t4, s4
	CMOVT	s1, s1, t1, $fcc0
	CMOVT	s2, s2, t2, $fcc1
	CMOVT	s3, s3, t3, $fcc2
	CMOVT	s4, s4, t4, $fcc3
	.align 3

/* Remainder: the low three bits of N, one element per pass, into s1 only. */
.L15:
	andi	I, N, 7
	NOP
	bge	$r0, I, .L998
	.align 3

.L16:
	LD	a1, X, 0 * SIZE
	addi.d	I, I, -1
	FABS	t1, a1
	CMPLT	$fcc0, t1, s1
	CMOVT	s1, s1, t1, $fcc0
	add.d	X, X, INCX
	blt	$r0, I, .L16
	.align 3

/* Merge the four candidates pairwise into s1. */
.L998:
	CMPLT	$fcc0, s2, s1
	CMPLT	$fcc1, s4, s3
	CMOVT	s1, s1, s2, $fcc0
	CMOVT	s3, s3, s4, $fcc1
	CMPLT	$fcc0, s3, s1
	CMOVT	s1, s1, s3, $fcc0
	.align 3

/* Exit: result in $f0, jump to the address held in $r1. */
.L999:
	/* NOTE(review): copies I into $r4; the purpose is not evident from this file. */
	move	$r4, $r17
	fmov.d	$f0, $f22
	jirl	$r0, $r1, 0x0

	EPILOGUE
|
||||
|
|
@ -0,0 +1,232 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
 * Sum of absolute values over a strided vector.
 *
 *   N    ($r4)  element count
 *   X    ($r5)  address of the first element
 *   INCX ($r6)  stride; shifted left by BASE_SHIFT before use (presumably
 *               converting an element stride to bytes -- TODO confirm)
 *
 * Two partial sums (s1, s2) are accumulated and added together at .L999; the
 * total is returned in $f0.  When N <= 0 or INCX <= 0 both partial sums are
 * still the value moved in from $r0 (presumably 0.0), so the routine returns
 * that value without touching memory.
 *
 * When the scaled stride equals SIZE the fixed-offset path (.L12/.L13/.L16) is
 * used; otherwise the strided path (.L20 onward) is used.
 *
 * LD, FABS, ADD, MTC, LDINT, NOP, SIZE and BASE_SHIFT are macros from
 * common.h, which is not visible here.
 */
#define ASSEMBLER
#include "common.h"

/* integer registers */
#define N	$r4
#define X	$r5
#define INCX	$r6
#define I	$r17
#define TEMP	$r18

/* eight loaded input values */
#define a1	$f23
#define a2	$f9
#define a3	$f10
#define a4	$f11
#define a5	$f12
#define a6	$f13
#define a7	$f14
#define a8	$f15

/* FABS results waiting to be accumulated */
#define t1	$f16
#define t2	$f17
#define t3	$f0
#define t4	$f1

/* two partial sums, combined at .L999 */
#define s1	$f22
#define s2	$f8

	PROLOGUE

#ifdef F_INTERFACE
	/* NOTE(review): these operands use an offset(base) form, unlike the LD
	   lines below -- confirm LDINT accepts it. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	MTC	s1, $r0
	MTC	s2, $r0
	slli.d	INCX, INCX, BASE_SHIFT
	li.d	TEMP, SIZE			/* scaled stride of a contiguous vector */
	bge	$r0, N, .L999			/* N <= 0 */
	/* Non-positive stride: return the untouched zero sum (reference BLAS
	   ?ASUM semantics) instead of stepping X backwards or re-reading x[0]. */
	bge	$r0, INCX, .L999
	srai.d	I, N, 3				/* I = N / 8 */
	bne	INCX, TEMP, .L20		/* not contiguous: strided path */

	/* ---- contiguous path ---- */
	bge	$r0, I, .L15			/* fewer than 8 elements */

	/* Preload the first group of eight and start taking absolute values. */
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	LD	a3, X, 2 * SIZE
	LD	a4, X, 3 * SIZE
	LD	a5, X, 4 * SIZE
	FABS	t1, a1
	LD	a6, X, 5 * SIZE
	FABS	t2, a2
	LD	a7, X, 6 * SIZE
	FABS	t3, a3
	FABS	t4, a4
	addi.d	I, I, -1
	LD	a8, X, 7 * SIZE
	bge	$r0, I, .L13			/* no further group to preload */
	.align 3

/* Main loop: accumulate the loaded group while loading the next one. */
.L12:
	ADD	s1, s1, t1
	LD	a1, X, 8 * SIZE
	FABS	t1, a5
	addi.d	I, I, -1
	ADD	s2, s2, t2
	LD	a2, X, 9 * SIZE
	FABS	t2, a6
	NOP
	ADD	s1, s1, t3
	LD	a3, X, 10 * SIZE
	FABS	t3, a7
	NOP
	ADD	s2, s2, t4
	LD	a4, X, 11 * SIZE
	FABS	t4, a8
	addi.d	X, X, 8 * SIZE		/* offsets below are relative to the new X */
	ADD	s1, s1, t1
	LD	a5, X, 4 * SIZE
	FABS	t1, a1
	NOP
	ADD	s2, s2, t2
	LD	a6, X, 5 * SIZE
	FABS	t2, a2
	NOP
	ADD	s1, s1, t3
	LD	a7, X, 6 * SIZE
	FABS	t3, a3
	NOP
	ADD	s2, s2, t4
	LD	a8, X, 7 * SIZE
	FABS	t4, a4
	blt	$r0, I, .L12
	.align 3

/* Drain: accumulate the last loaded group without issuing further loads. */
.L13:
	ADD	s1, s1, t1
	addi.d	X, X, 8 * SIZE
	FABS	t1, a5
	NOP
	ADD	s2, s2, t2
	FABS	t2, a6
	ADD	s1, s1, t3
	FABS	t3, a7
	ADD	s2, s2, t4
	FABS	t4, a8
	ADD	s1, s1, t1
	ADD	s2, s2, t2
	ADD	s1, s1, t3
	ADD	s2, s2, t4
	.align 3

/* Contiguous remainder: the low three bits of N, one element per pass. */
.L15:
	andi	I, N, 7
	bge	$r0, I, .L999
	.align 3

.L16:
	LD	a1, X, 0 * SIZE
	addi.d	I, I, -1
	FABS	t1, a1
	ADD	s1, s1, t1
	addi.d	X, X, SIZE
	blt	$r0, I, .L16
	b	.L999
	.align 3

	/* ---- strided path ---- */
.L20:
	bge	$r0, I, .L25			/* fewer than 8 elements */

	/* Preload the first group of eight, stepping X by INCX after each load. */
	LD	a1, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a2, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a3, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a4, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a5, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a6, X, 0 * SIZE
	add.d	X, X, INCX
	FABS	t1, a1
	LD	a7, X, 0 * SIZE
	FABS	t2, a2
	add.d	X, X, INCX
	FABS	t3, a3
	LD	a8, X, 0 * SIZE
	FABS	t4, a4
	addi.d	I, I, -1
	add.d	X, X, INCX
	bge	$r0, I, .L24			/* no further group to preload */
	.align 3

/* Main loop: accumulate the loaded group while loading the next one. */
.L23:
	ADD	s1, s1, t1
	LD	a1, X, 0 * SIZE
	FABS	t1, a5
	add.d	X, X, INCX
	ADD	s2, s2, t2
	LD	a2, X, 0 * SIZE
	FABS	t2, a6
	add.d	X, X, INCX
	ADD	s1, s1, t3
	LD	a3, X, 0 * SIZE
	FABS	t3, a7
	add.d	X, X, INCX
	ADD	s2, s2, t4
	LD	a4, X, 0 * SIZE
	FABS	t4, a8
	add.d	X, X, INCX
	ADD	s1, s1, t1
	LD	a5, X, 0 * SIZE
	FABS	t1, a1
	add.d	X, X, INCX
	ADD	s2, s2, t2
	LD	a6, X, 0 * SIZE
	FABS	t2, a2
	add.d	X, X, INCX
	ADD	s1, s1, t3
	LD	a7, X, 0 * SIZE
	FABS	t3, a3
	add.d	X, X, INCX
	ADD	s2, s2, t4
	LD	a8, X, 0 * SIZE
	FABS	t4, a4
	addi.d	I, I, -1
	add.d	X, X, INCX
	blt	$r0, I, .L23
	.align 3

/* Drain: accumulate the last loaded group without issuing further loads. */
.L24:
	ADD	s1, s1, t1
	FABS	t1, a5
	ADD	s2, s2, t2
	FABS	t2, a6
	ADD	s1, s1, t3
	FABS	t3, a7
	ADD	s2, s2, t4
	FABS	t4, a8
	ADD	s1, s1, t1
	ADD	s2, s2, t2
	ADD	s1, s1, t3
	ADD	s2, s2, t4
	.align 3

/* Strided remainder: the low three bits of N, one element per pass. */
.L25:
	andi	I, N, 7
	bge	$r0, I, .L999
	.align 3

.L26:
	LD	a1, X, 0 * SIZE
	addi.d	I, I, -1
	FABS	t1, a1
	add.d	X, X, INCX
	ADD	s1, s1, t1
	blt	$r0, I, .L26
	.align 3

/* Exit: combine the partial sums, result in $f0, jump to the address in $r1. */
.L999:
	ADD	s1, s1, s2
	/* NOTE(review): copies I into $r4; the purpose is not evident from this file. */
	move	$r4, $r17
	fmov.d	$f0, $f22
	jirl	$r0, $r1, 0x0

	EPILOGUE
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
 * Euclidean norm of a strided vector whose elements are pairs of values
 * stored at offsets 0 and SIZE (presumably the real and imaginary parts of a
 * single-precision complex number -- TODO confirm against common.h).
 *
 *   N    ($r4)  element (pair) count
 *   X    ($r5)  address of the first element
 *   INCX ($r6)  stride; shifted left by ZBASE_SHIFT before use
 *
 * Each loaded value is widened with fcvt.d.s and its square accumulated in
 * double precision: first members of each pair into s1, second members into
 * s2.  At .L999 the two sums are added, the square root is taken, and the
 * result is narrowed with fcvt.s.d into $f0.  For N <= 0 or INCX <= 0 the
 * sums are still zero when .L999 is reached.
 *
 * LD, LDINT, SIZE and ZBASE_SHIFT are macros from common.h, which is not
 * visible here.
 */
#define ASSEMBLER

#include "common.h"

/* integer registers */
#define N	$r4
#define X	$r5
#define INCX	$r6
#define I	$r17
#define TEMP	$r18

/* eight loaded values = four pairs */
#define a1	$f12
#define a2	$f13
#define a3	$f14
#define a4	$f15
#define a5	$f16
#define a6	$f17
#define a7	$f0
#define a8	$f1

/* two partial sums of squares */
#define s1	$f22
#define s2	$f8

/* widened values waiting to be squared */
#define t1	$f23
#define t2	$f9
#define t3	$f10
#define t4	$f11

	PROLOGUE

#ifdef F_INTERFACE
	/* NOTE(review): these operands use an offset(base) form, unlike the LD
	   lines below -- confirm LDINT accepts it. */
	LDINT	N, 0(N)
	LDINT	INCX, 0(INCX)
#endif

	movgr2fr.d	s1, $r0			/* s1 = 0 */
	/* NOTE(review): TEMP is not read anywhere else in this routine. */
	li.d	TEMP, 2 * SIZE
	fmov.d	s2, s1				/* s2 = 0 */
	bge	$r0, N, .L999			/* N <= 0 */
	slli.d	INCX, INCX, ZBASE_SHIFT
	bge	$r0, INCX, .L999		/* INCX <= 0 */
	srai.d	I, N, 2				/* I = N / 4 */
	bge	$r0, I, .L25			/* fewer than 4 elements */

	/* Preload the first four pairs and start widening them. */
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a3, X, 0 * SIZE
	LD	a4, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a5, X, 0 * SIZE
	LD	a6, X, 1 * SIZE
	add.d	X, X, INCX
	fcvt.d.s	t1, a1
	LD	a7, X, 0 * SIZE
	fcvt.d.s	t2, a2
	LD	a8, X, 1 * SIZE
	fcvt.d.s	t3, a3
	addi.d	I, I, -1
	fcvt.d.s	t4, a4
	add.d	X, X, INCX
	bge	$r0, I, .L24			/* no further group to preload */
	.align 3

/* Main loop: accumulate the loaded group while loading the next one. */
.L23:
	fmadd.d	s1, t1, t1, s1
	LD	a1, X, 0 * SIZE
	fcvt.d.s	t1, a5
	fmadd.d	s2, t2, t2, s2
	LD	a2, X, 1 * SIZE
	fcvt.d.s	t2, a6
	add.d	X, X, INCX
	fmadd.d	s1, t3, t3, s1
	LD	a3, X, 0 * SIZE
	fcvt.d.s	t3, a7
	fmadd.d	s2, t4, t4, s2
	LD	a4, X, 1 * SIZE
	fcvt.d.s	t4, a8
	add.d	X, X, INCX
	fmadd.d	s1, t1, t1, s1
	LD	a5, X, 0 * SIZE
	fcvt.d.s	t1, a1
	addi.d	I, I, -1
	fmadd.d	s2, t2, t2, s2
	LD	a6, X, 1 * SIZE
	fcvt.d.s	t2, a2
	add.d	X, X, INCX
	fmadd.d	s1, t3, t3, s1
	LD	a7, X, 0 * SIZE
	fcvt.d.s	t3, a3
	LD	a8, X, 1 * SIZE
	fmadd.d	s2, t4, t4, s2
	add.d	X, X, INCX
	fcvt.d.s	t4, a4
	blt	$r0, I, .L23
	.align 3

/* Drain: accumulate the last loaded group without issuing further loads. */
.L24:
	fmadd.d	s1, t1, t1, s1
	fcvt.d.s	t1, a5
	fmadd.d	s2, t2, t2, s2
	fcvt.d.s	t2, a6
	fmadd.d	s1, t3, t3, s1
	fcvt.d.s	t3, a7
	fmadd.d	s2, t4, t4, s2
	fcvt.d.s	t4, a8
	fmadd.d	s1, t1, t1, s1
	fmadd.d	s2, t2, t2, s2
	fmadd.d	s1, t3, t3, s1
	fmadd.d	s2, t4, t4, s2
	.align 3

/* Remainder: the low two bits of N, one pair per pass. */
.L25:
	andi	I, N, 3
	bge	$r0, I, .L999
	.align 3

.L26:
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	addi.d	I, I, -1
	fcvt.d.s	t1, a1
	fcvt.d.s	t2, a2
	fmadd.d	s1, t1, t1, s1
	add.d	X, X, INCX
	fmadd.d	s2, t2, t2, s2
	blt	$r0, I, .L26
	.align 3

/* Exit: sqrt(s1 + s2), narrowed into $f0; jump to the address held in $r1. */
.L999:
	fadd.d	s1, s1, s2
	fsqrt.d	s1, s1
	/* NOTE(review): copies I into $r4; the purpose is not evident from this file. */
	move	$r4, $r17
	fcvt.s.d	$f0, s1
	jirl	$r0, $r1, 0x0

	EPILOGUE
|
||||
|
|
@ -0,0 +1,225 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
/* Vector copy kernel: stores the elements read from X into Y.
 * N, X, INCX, Y and INCY are read on entry; I is the loop counter, TEMP is
 * scratch, and a1..a8 hold up to eight elements in flight. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N, INCX and INCY are pointers and are dereferenced here.
 * NOTE(review): the `0(N)` operand form differs from the `reg, base, offset`
 * form used by every other load in this routine -- confirm this assembles
 * when F_INTERFACE is defined. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
LDINT INCY, 0(INCY)
|
||||
#endif
|
||||
|
||||
/* TEMP = SIZE. Both strides are shifted left by BASE_SHIFT and compared with
 * SIZE (presumably BASE_SHIFT = log2(SIZE), giving byte strides -- confirm).
 * N <= 0 exits at once; any stride other than SIZE goes to .L20. */
li.d TEMP, SIZE
|
||||
NOP
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, N, .L999
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20
|
||||
/* I = N / 8 groups of eight elements. */
srai.d I, N, 3
|
||||
bne INCY, TEMP, .L20
|
||||
/* I = groups - 1; negative means fewer than eight elements in total. */
addi.d I, I, -1
|
||||
blt I, $r0, .L15
|
||||
/* Preload the first group of eight from X. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, X, 2 * SIZE
|
||||
LD a4, X, 3 * SIZE
|
||||
LD a5, X, 4 * SIZE
|
||||
LD a6, X, 5 * SIZE
|
||||
LD a7, X, 6 * SIZE
|
||||
LD a8, X, 7 * SIZE
|
||||
/* Exactly one group: only the drain at .L13 is needed. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Main loop: store the group already in registers while loading the next
 * group (offsets 8..15), then advance both pointers by eight elements. */
.L12:
|
||||
ST a1, Y, 0 * SIZE
|
||||
LD a1, X, 8 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
LD a2, X, 9 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
LD a3, X, 10 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
LD a4, X, 11 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
LD a5, X, 12 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
LD a6, X, 13 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
LD a7, X, 14 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
LD a8, X, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: store the last preloaded group; no further loads. */
.L13:
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Tail: I = N mod 8 leftover elements. */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
/* One element per iteration; Y is advanced before the store, hence the
 * -1 * SIZE offset. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d X, X, SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d Y, Y, SIZE
|
||||
ST a1, Y, -1 * SIZE
|
||||
blt $r0, I, .L16
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
/* Strided copy path: the same eight-element scheme as a contiguous copy,
 * but X advances by INCX and Y by INCY after every element. */
.L20:
|
||||
srai.d I, N, 3
|
||||
addi.d I, I, -1
|
||||
/* Fewer than eight elements: go straight to the tail loop. */
blt I, $r0, .L25
|
||||
/* Preload the first eight elements. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
/* Exactly one group: skip the loop and drain at .L23. */
bge $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
/* Main loop: each register is stored to Y and immediately reloaded with
 * the element eight positions further along X. */
.L22:
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a5, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a6, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a7, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a8, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
/* Drain: store the last eight preloaded elements. */
.L23:
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a5, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a6, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a7, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a8, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
.align 3
|
||||
|
||||
/* Tail: I = N mod 8 leftover elements, one per iteration. */
.L25:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
/* Common exit: copies $r17 into $r4 and $f22 into $f0, then returns through
 * $r1. NOTE(review): $r17 and $f22 are the loop counter and a data register
 * here, so these two moves do not look like a meaningful return value --
 * confirm callers ignore it. */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,314 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Scaled Euclidean-norm kernel, first pass: find the largest |x[i]|.
 * N, X and INCX are read on entry. XX keeps the original X for the second
 * pass, a1..a8 hold loaded elements, t1..t4 their absolute values, s1..s4
 * four independent running maxima, ALPHA and max are used after this pass. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define XX $r7
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
#define ALPHA $f4
|
||||
#define max $f5
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX are pointers and are dereferenced here.
 * NOTE(review): the `0(N)` operand form differs from the `reg, base, offset`
 * form used by every other load in this routine -- confirm this assembles
 * when F_INTERFACE is defined. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* s1 is set from $r0 (presumably MTC moves a GPR into an FPR, giving 0.0).
 * N <= 0 or a non-positive stride exits through .L999, which returns s1. */
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* Remember the start of the vector for the second pass. */
move XX, X
|
||||
NOP
|
||||
/* Seed all four maxima with |x[0]|; N now counts the remaining elements. */
LD a1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
add.d X, X, INCX
|
||||
FABS s1, a1
|
||||
FABS s2, a1
|
||||
/* Single-element vector: exit with s1 = |x[0]|. */
bge $r0, N, .L999
|
||||
FABS s3, a1
|
||||
srai.d I, N, 3
|
||||
FABS s4, a1
|
||||
/* Fewer than eight remaining elements: tail loop only. */
bge $r0, I, .L15
|
||||
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Main loop: take |a1..a8| four at a time and fold them into s1..s4 while
 * the next group is loaded. CMPLT/CMOVT are macros defined elsewhere;
 * presumably compare-less-than and conditional select, so each pair keeps
 * the larger of sK and tK -- confirm. */
.L12:
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
FABS t1, a5
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
addi.d I, I, -1
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: fold the last preloaded group into s1..s4; no further loads. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
FABS t1, a5
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
.align 3
|
||||
|
||||
/* Tail: I = N mod 8 leftover elements, folded into s1 only. */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L100
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS t1, a1
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Second pass of the scaled norm: reduce the four running maxima s1..s4 to
 * one value, then accumulate the squares of the elements scaled by 1/max and
 * return max * sqrt(sum). */
.L100:
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
/* Restore N to the full element count for this pass. */
addi.d N, N, 1
|
||||
/* TEMP = 0x3f800000, the single-precision bit pattern of 1.0; it is moved
 * into ALPHA and widened to double below. a1 = 0.0 for the zero test. */
lu12i.w TEMP, 0x3f800
|
||||
movgr2fr.d a1, $r0
|
||||
movgr2fr.w ALPHA, TEMP
|
||||
CMPEQ $fcc0, s1, a1
|
||||
fcvt.d.s ALPHA, ALPHA
|
||||
/* A maximum equal to zero exits through .L999, which returns s1. */
bcnez $fcc0, .L999
|
||||
/* ALPHA = 1.0 / max. NOTE(review): fdiv.d (and fsqrt.d further down) are
 * explicit double-precision instructions, so this routine looks
 * double-precision only -- confirm. */
fdiv.d ALPHA, ALPHA, s1
|
||||
MOV max, s1
|
||||
/* Clear the four partial sums. */
MOV s1, a1
|
||||
MOV s2, a1
|
||||
MOV s3, a1
|
||||
MOV s4, a1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L105
|
||||
/* Preload the first group of eight, starting again from XX. */
LD a1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a2, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a3, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a4, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a5, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a6, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a7, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a8, XX, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d XX, XX, INCX
|
||||
bge $r0, I, .L104
|
||||
.align 3
|
||||
|
||||
/* Main loop: tK = ALPHA * aK, then sK receives MADD(tK, tK, sK) (presumably
 * sK += tK * tK), interleaved with the loads of the next group. */
.L103:
|
||||
MUL t1, ALPHA, a1
|
||||
LD a1, XX, 0 * SIZE
|
||||
MUL t2, ALPHA, a2
|
||||
add.d XX, XX, INCX
|
||||
MUL t3, ALPHA, a3
|
||||
LD a2, XX, 0 * SIZE
|
||||
MUL t4, ALPHA, a4
|
||||
add.d XX, XX, INCX
|
||||
MADD s1, t1, t1, s1
|
||||
LD a3, XX, 0 * SIZE
|
||||
MADD s2, t2, t2, s2
|
||||
add.d XX, XX, INCX
|
||||
MADD s3, t3, t3, s3
|
||||
LD a4, XX, 0 * SIZE
|
||||
MADD s4, t4, t4, s4
|
||||
add.d XX, XX, INCX
|
||||
MUL t1, ALPHA, a5
|
||||
LD a5, XX, 0 * SIZE
|
||||
MUL t2, ALPHA, a6
|
||||
add.d XX, XX, INCX
|
||||
MUL t3, ALPHA, a7
|
||||
LD a6, XX, 0 * SIZE
|
||||
MUL t4, ALPHA, a8
|
||||
add.d XX, XX, INCX
|
||||
MADD s1, t1, t1, s1
|
||||
LD a7, XX, 0 * SIZE
|
||||
MADD s2, t2, t2, s2
|
||||
add.d XX, XX, INCX
|
||||
MADD s3, t3, t3, s3
|
||||
LD a8, XX, 0 * SIZE
|
||||
MADD s4, t4, t4, s4
|
||||
addi.d I, I, -1
|
||||
add.d XX, XX, INCX
|
||||
blt $r0, I, .L103
|
||||
.align 3
|
||||
|
||||
/* Drain: accumulate the last preloaded group; no further loads. */
.L104:
|
||||
MUL t1, ALPHA, a1
|
||||
MUL t2, ALPHA, a2
|
||||
MUL t3, ALPHA, a3
|
||||
MUL t4, ALPHA, a4
|
||||
MADD s1, t1, t1, s1
|
||||
MADD s2, t2, t2, s2
|
||||
MADD s3, t3, t3, s3
|
||||
MADD s4, t4, t4, s4
|
||||
MUL t1, ALPHA, a5
|
||||
MUL t2, ALPHA, a6
|
||||
MUL t3, ALPHA, a7
|
||||
MUL t4, ALPHA, a8
|
||||
MADD s1, t1, t1, s1
|
||||
MADD s2, t2, t2, s2
|
||||
MADD s3, t3, t3, s3
|
||||
MADD s4, t4, t4, s4
|
||||
.align 3
|
||||
|
||||
/* Tail: I = N mod 8 leftover elements, accumulated into s1. */
.L105:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L106:
|
||||
LD a1, XX, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
MUL t1, ALPHA, a1
|
||||
add.d XX, XX, INCX
|
||||
MADD s1, t1, t1, s1
|
||||
blt $r0, I, .L106
|
||||
.align 3
|
||||
|
||||
/* Normal exit: add the four partial sums, take the square root and scale
 * by max; the product is written to $f0. */
.L998:
|
||||
ADD s1, s1, s2
|
||||
ADD s3, s3, s4
|
||||
ADD s1, s1, s3
|
||||
fsqrt.d s1, s1
|
||||
move $r4, $r17
|
||||
MUL $f0, max, s1
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
/* Early exit: copies $f22 (the register aliased as s1) into $f0 and
 * returns through $r1. */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,391 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
/* Dot-product kernel: accumulates products of elements of X and Y.
 * N, X, INCX, Y and INCY are read on entry. a1..a4 / b1..b4 hold elements of
 * X / Y, and s1 / s2 are two independent partial sums. When DSDOT is
 * defined, each operand is widened with fcvt.d.s and accumulated with
 * fmadd.d; otherwise the MADD macro is used. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define a1 $f23
|
||||
#define a2 $f9
|
||||
#define a3 $f10
|
||||
#define a4 $f11
|
||||
#define b1 $f12
|
||||
#define b2 $f13
|
||||
#define b3 $f14
|
||||
#define b4 $f15
|
||||
#define s1 $f22
|
||||
#define s2 $f8
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N, INCX and INCY are pointers and are dereferenced here.
 * NOTE(review): the `0(N)` operand form differs from the `reg, base, offset`
 * form used by every other load in this routine -- confirm this assembles
 * when F_INTERFACE is defined. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
LDINT INCY, 0(INCY)
|
||||
#endif
|
||||
|
||||
/* Both partial sums are set from $r0 (presumably MTC moves a GPR into an
 * FPR, giving 0.0). Strides are shifted by BASE_SHIFT and compared with
 * SIZE; N <= 0 exits, and any stride other than SIZE goes to .L20. */
MTC s1, $r0
|
||||
MTC s2, $r0
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
li.d TEMP, SIZE
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
bge $r0, N, .L999
|
||||
/* I = N / 8 groups of eight elements. */
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bne INCY, TEMP, .L20
|
||||
bge $r0, I, .L15
|
||||
/* Preload elements 0..3 of both vectors. */
LD a1, X, 0 * SIZE
|
||||
LD b1, Y, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD b2, Y, 1 * SIZE
|
||||
LD a3, X, 2 * SIZE
|
||||
LD b3, Y, 2 * SIZE
|
||||
LD a4, X, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD b4, Y, 3 * SIZE
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Main loop, eight elements per iteration: each product is accumulated into
 * s1 or s2 (alternating) and the registers are reloaded four elements
 * ahead, so offsets 8..11 belong to the next iteration. */
.L12:
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
LD a1, X, 4 * SIZE
|
||||
LD b1, Y, 4 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a2, a2
|
||||
fcvt.d.s b2, b2
|
||||
fmadd.d s2, b2, a2, s2
|
||||
#else
|
||||
MADD s2, b2, a2, s2
|
||||
#endif
|
||||
LD a2, X, 5 * SIZE
|
||||
LD b2, Y, 5 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a3, a3
|
||||
fcvt.d.s b3, b3
|
||||
fmadd.d s1, b3, a3, s1
|
||||
#else
|
||||
MADD s1, b3, a3, s1
|
||||
#endif
|
||||
LD a3, X, 6 * SIZE
|
||||
LD b3, Y, 6 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a4, a4
|
||||
fcvt.d.s b4, b4
|
||||
fmadd.d s2, b4, a4, s2
|
||||
#else
|
||||
MADD s2, b4, a4, s2
|
||||
#endif
|
||||
LD a4, X, 7 * SIZE
|
||||
LD b4, Y, 7 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
LD a1, X, 8 * SIZE
|
||||
LD b1, Y, 8 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a2, a2
|
||||
fcvt.d.s b2, b2
|
||||
fmadd.d s2, b2, a2, s2
|
||||
#else
|
||||
MADD s2, b2, a2, s2
|
||||
#endif
|
||||
LD a2, X, 9 * SIZE
|
||||
LD b2, Y, 9 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a3, a3
|
||||
fcvt.d.s b3, b3
|
||||
fmadd.d s1, b3, a3, s1
|
||||
#else
|
||||
MADD s1, b3, a3, s1
|
||||
#endif
|
||||
LD a3, X, 10 * SIZE
|
||||
LD b3, Y, 10 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a4, a4
|
||||
fcvt.d.s b4, b4
|
||||
fmadd.d s2, b4, a4, s2
|
||||
#else
|
||||
MADD s2, b4, a4, s2
|
||||
#endif
|
||||
LD a4, X, 11 * SIZE
|
||||
LD b4, Y, 11 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
/* Drain: finish the last group of eight. Elements 4..7 are still loaded
 * here, but nothing beyond the group is read. */
.L13:
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
LD a1, X, 4 * SIZE
|
||||
LD b1, Y, 4 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a2, a2
|
||||
fcvt.d.s b2, b2
|
||||
fmadd.d s2, b2, a2, s2
|
||||
#else
|
||||
MADD s2, b2, a2, s2
|
||||
#endif
|
||||
LD a2, X, 5 * SIZE
|
||||
LD b2, Y, 5 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a3, a3
|
||||
fcvt.d.s b3, b3
|
||||
fmadd.d s1, b3, a3, s1
|
||||
#else
|
||||
MADD s1, b3, a3, s1
|
||||
#endif
|
||||
LD a3, X, 6 * SIZE
|
||||
LD b3, Y, 6 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a4, a4
|
||||
fcvt.d.s b4, b4
|
||||
fmadd.d s2, b4, a4, s2
|
||||
#else
|
||||
MADD s2, b4, a4, s2
|
||||
#endif
|
||||
LD a4, X, 7 * SIZE
|
||||
LD b4, Y, 7 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
addi.d X, X, 8 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a2, a2
|
||||
fcvt.d.s b2, b2
|
||||
fmadd.d s2, b2, a2, s2
|
||||
#else
|
||||
MADD s2, b2, a2, s2
|
||||
#endif
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a3, a3
|
||||
fcvt.d.s b3, b3
|
||||
fmadd.d s1, b3, a3, s1
|
||||
#else
|
||||
MADD s1, b3, a3, s1
|
||||
#endif
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a4, a4
|
||||
fcvt.d.s b4, b4
|
||||
fmadd.d s2, b4, a4, s2
|
||||
#else
|
||||
MADD s2, b4, a4, s2
|
||||
#endif
|
||||
.align 3
|
||||
/* Tail: I = N mod 8 leftover elements, accumulated into s1. */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD b1, Y, 0 * SIZE
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, SIZE
|
||||
addi.d Y, Y, SIZE
|
||||
blt $r0, I, .L16
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
/* Entry of the strided path (either stride differs from SIZE).
 *
 * With F_INTERFACE, a negative stride means the vector is walked backwards,
 * so the base pointer is first moved by (N - 1) * stride; the stride is
 * already a byte offset and negative, so the subtraction moves the pointer
 * forwards to the element the walk starts from.
 *
 * The multiply/subtract were previously written with the MIPS mnemonics
 * mult / mflo / dsub, which LoongArch does not provide (there is no HI/LO
 * register pair); they are now mul.d / sub.d. The sign test uses the same
 * `bge reg, $r0` form as the rest of this routine. */
.L20:
|
||||
#ifdef F_INTERFACE
|
||||
/* INCX >= 0: X needs no adjustment. */
bge INCX, $r0, .L21
|
||||
addi.d TEMP, N, -1
|
||||
/* TEMP = (N - 1) * INCX, a negative byte offset. */
mul.d TEMP, TEMP, INCX
|
||||
sub.d X, X, TEMP
|
||||
.align 3
|
||||
|
||||
.L21:
|
||||
/* INCY >= 0: Y needs no adjustment. */
bge INCY, $r0, .L22
|
||||
addi.d TEMP, N, -1
|
||||
/* TEMP = (N - 1) * INCY, a negative byte offset. */
mul.d TEMP, TEMP, INCY
|
||||
sub.d Y, Y, TEMP
|
||||
.align 3
|
||||
|
||||
.L22:
|
||||
#endif
|
||||
/* I still holds N / 8 from the prologue; no full group means tail only. */
bge $r0, I, .L25
|
||||
.align 3
|
||||
|
||||
/* Strided dot-product loop, eight elements per iteration. Only a1 / b1 are
 * used: each element pair is loaded, both pointers advance by their stride,
 * and the product is accumulated into s1 and s2 alternately. When DSDOT is
 * defined the operands are widened with fcvt.d.s first. */
.L23:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s2, b1, a1, s2
|
||||
#else
|
||||
MADD s2, b1, a1, s2
|
||||
#endif
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s2, b1, a1, s2
|
||||
#else
|
||||
MADD s2, b1, a1, s2
|
||||
#endif
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s2, b1, a1, s2
|
||||
#else
|
||||
MADD s2, b1, a1, s2
|
||||
#endif
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
addi.d I, I, -1
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s2, b1, a1, s2
|
||||
#else
|
||||
MADD s2, b1, a1, s2
|
||||
#endif
|
||||
blt $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
/* Tail: I = N mod 8 leftover elements, accumulated into s1. */
.L25:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
addi.d I, I, -1
|
||||
#ifdef DSDOT
|
||||
fcvt.d.s a1, a1
|
||||
fcvt.d.s b1, b1
|
||||
fmadd.d s1, b1, a1, s1
|
||||
#else
|
||||
MADD s1, b1, a1, s1
|
||||
#endif
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
/* Exit: the result written to $f0 is s1 + s2 (explicit fadd.d when DSDOT is
 * defined, the ADD macro otherwise); returns through $r1. */
.L999:
|
||||
#ifdef DSDOT
|
||||
fadd.d $f0, s1, s2
|
||||
#else
|
||||
ADD $f0, s1, s2
|
||||
#endif
|
||||
move $r4, $r17
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,531 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Matrix-vector update kernel: y receives y + (ALPHA * x[j]) * column j of
 * A, for each of the N columns; columns of A are LDA elements apart and the
 * M rows of a column are contiguous.
 * M, N, A, LDA, X, INCX and Y are read from registers; INCY and BUFFER are
 * loaded from the stack. $r6 (the unused dummy1 parameter) is reused for
 * INCY. */
/* Unused param dummy1 */
|
||||
#define M $r4
|
||||
#define N $r5
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INCX $r10
|
||||
#define Y $r11
|
||||
#define INCY $r6
|
||||
#define BUFFER $r16
|
||||
#define YORIG $r18
|
||||
#define XX $r12
|
||||
#define YY $r13
|
||||
#define I $r14
|
||||
#define J $r15
|
||||
#define AO1 $r23
|
||||
#define AO2 $r24
|
||||
#define ALPHA $f0
|
||||
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
#define x1 $f14
|
||||
#define x2 $f15
|
||||
#define y1 $f16
|
||||
#define y2 $f17
|
||||
#define y3 $f3
|
||||
#define y4 $f1
|
||||
#define y5 $f2
|
||||
#define y6 $f4
|
||||
#define y7 $f5
|
||||
#define y8 $f6
|
||||
#define t1 $f7
|
||||
#define t2 $f18
|
||||
#define t3 $f19
|
||||
#define t4 $f20
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* INCY and BUFFER are read from the caller's stack area before the frame
 * is opened. */
LDARG INCY, $sp, 0
|
||||
LDARG BUFFER, $sp, 8
|
||||
/* Frame: 16 bytes, or 48 bytes when __64BIT__ is not defined. */
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, -16
|
||||
#else
|
||||
addi.d $sp, $sp, -48
|
||||
#endif
|
||||
/* Save $r23 / $r24, which this routine uses as AO1 / AO2. */
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
slli.d LDA, LDA, BASE_SHIFT
|
||||
/* Without __64BIT__, $f18..$f20 (used as t2..t4) are saved as well. */
#ifndef __64BIT__
|
||||
fst.d $f18, $sp, 16
|
||||
fst.d $f19, $sp, 24
|
||||
fst.d $f20, $sp, 32
|
||||
#endif
|
||||
/* Strides are shifted by BASE_SHIFT (presumably into byte offsets -- they
 * are added straight to pointers). M <= 0 or N <= 0 exits. */
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, M, .L999
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
bge $r0, N, .L999
|
||||
/* If INCY equals SIZE the update works on Y in place (YORIG = Y). */
li.d I, SIZE
|
||||
move YORIG, Y
|
||||
beq INCY, I, .L10
|
||||
/* Otherwise the M elements of Y are gathered into BUFFER with unit spacing
 * and the update works on BUFFER (YORIG = BUFFER). */
srai.d I, M, 2
|
||||
move YORIG, BUFFER
|
||||
move XX, Y
|
||||
move YY, BUFFER
|
||||
bge $r0, I, .L05
|
||||
.align 3
|
||||
|
||||
/* Gather loop, four elements per iteration. */
.L02:
|
||||
LD a1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a2, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a3, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a4, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
ST a1, YY, 0 * SIZE
|
||||
ST a2, YY, 1 * SIZE
|
||||
ST a3, YY, 2 * SIZE
|
||||
ST a4, YY, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
blt $r0, I, .L02
|
||||
.align 3
|
||||
|
||||
/* Gather tail: M mod 4 leftover elements. */
.L05:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L06:
|
||||
LD a1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
ST a1, YY, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 1 * SIZE
|
||||
blt $r0, I, .L06
|
||||
.align 3
|
||||
|
||||
/* Column-pair loop: J = N / 2 iterations, each applying two adjacent
 * columns of A (AO1, AO2) to the contiguous y vector at YORIG. */
.L10:
|
||||
srai.d J, N, 1
|
||||
bge $r0, J, .L20
|
||||
.align 3
|
||||
|
||||
/* Per pair: load two elements of X, point AO1 / AO2 at the two columns,
 * advance A by two columns, and pre-scale both x values by ALPHA. */
.L11:
|
||||
LD x1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD x2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
move AO1, A
|
||||
add.d AO2, A, LDA
|
||||
add.d A, AO2, LDA
|
||||
move YY, YORIG
|
||||
MUL x1, ALPHA, x1
|
||||
/* I = M / 8 row groups of eight. */
srai.d I, M, 3
|
||||
MUL x2, ALPHA, x2
|
||||
bge $r0, I, .L15
|
||||
/* Preload rows 0..3 of both columns and rows 0..7 of y. */
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
LD y5, YY, 4 * SIZE
|
||||
LD a6, AO2, 1 * SIZE
|
||||
LD y6, YY, 5 * SIZE
|
||||
LD a7, AO2, 2 * SIZE
|
||||
LD y7, YY, 6 * SIZE
|
||||
LD a8, AO2, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD y8, YY, 7 * SIZE
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
/* Main loop, eight rows per iteration in two halves of four. MADD is a
 * macro defined elsewhere; presumably d = a * b + c, so t = a*x1 + y and
 * then t = a'*x2 + t -- confirm. Loads for the following rows are
 * interleaved with the arithmetic. */
.L12:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
LD y1, YY, 8 * SIZE
|
||||
LD y2, YY, 9 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
LD y3, YY, 10 * SIZE
|
||||
LD y4, YY, 11 * SIZE
|
||||
MADD t1, a5, x2, t1
|
||||
LD a5, AO2, 4 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
LD a6, AO2, 5 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
LD a7, AO2, 6 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
ST t2, YY, 1 * SIZE
|
||||
ST t3, YY, 2 * SIZE
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
LD a2, AO1, 9 * SIZE
|
||||
LD y5, YY, 12 * SIZE
|
||||
LD y6, YY, 13 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
LD a3, AO1, 10 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
LD a4, AO1, 11 * SIZE
|
||||
LD y7, YY, 14 * SIZE
|
||||
LD y8, YY, 15 * SIZE
|
||||
MADD t1, a5, x2, t1
|
||||
LD a5, AO2, 8 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
LD a6, AO2, 9 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
LD a7, AO2, 10 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
LD a8, AO2, 11 * SIZE
|
||||
ST t1, YY, 4 * SIZE
|
||||
ST t2, YY, 5 * SIZE
|
||||
ST t3, YY, 6 * SIZE
|
||||
ST t4, YY, 7 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: finish the last group of eight rows. Rows 4..7 of both columns are
 * loaded here; the stores of rows 4..7 use negative offsets because YY has
 * already been advanced. */
.L13:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
MADD t1, a5, x2, t1
|
||||
LD a5, AO2, 4 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
LD a6, AO2, 5 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
LD a7, AO2, 6 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
ST t2, YY, 1 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
ST t3, YY, 2 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
MADD t1, a5, x2, t1
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
ST t1, YY, -4 * SIZE
|
||||
ST t2, YY, -3 * SIZE
|
||||
ST t3, YY, -2 * SIZE
|
||||
ST t4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Row remainder: bit 2 of M set means four more rows. */
.L15:
|
||||
andi I, M, 4
|
||||
bge $r0, I, .L16
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
LD a6, AO2, 1 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
LD a7, AO2, 2 * SIZE
|
||||
MADD y3, a3, x1, y3
|
||||
LD a8, AO2, 3 * SIZE
|
||||
MADD y4, a4, x1, y4
|
||||
MADD y1, a5, x2, y1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
MADD y2, a6, x2, y2
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
MADD y3, a7, x2, y3
|
||||
addi.d AO2, AO2, 4 * SIZE
|
||||
MADD y4, a8, x2, y4
|
||||
ST y1, YY, -4 * SIZE
|
||||
ST y2, YY, -3 * SIZE
|
||||
ST y3, YY, -2 * SIZE
|
||||
ST y4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Row remainder: bit 1 of M set means two more rows. */
.L16:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L17
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
LD a6, AO2, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y2, a2, x1, y2
|
||||
addi.d YY, YY, 2 * SIZE
|
||||
MADD y1, a5, x2, y1
|
||||
addi.d AO1, AO1, 2 * SIZE
|
||||
MADD y2, a6, x2, y2
|
||||
addi.d AO2, AO2, 2 * SIZE
|
||||
ST y1, YY, -2 * SIZE
|
||||
ST y2, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Row remainder: bit 0 of M set means one last row. */
.L17:
|
||||
andi I, M, 1
|
||||
bge $r0, I, .L19
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y1, a5, x2, y1
|
||||
ST y1, YY, 0 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Next column pair. */
.L19:
|
||||
addi.d J, J, -1
|
||||
blt $r0, J, .L11
|
||||
.align 3
|
||||
|
||||
/* Odd last column: runs only when bit 0 of N is set, applying a single
 * column of A (AO1) scaled by ALPHA * x to the contiguous y vector at
 * YORIG. */
.L20:
|
||||
andi J, N, 1
|
||||
bge $r0, J, .L900
|
||||
.align 3
|
||||
|
||||
.L21:
|
||||
LD x1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
move YY, YORIG
|
||||
move AO1, A
|
||||
/* I = M / 8 row groups of eight. */
srai.d I, M, 3
|
||||
MUL x1, ALPHA, x1
|
||||
bge $r0, I, .L25
|
||||
/* Preload rows 0..3 of the column and rows 0..7 of y. */
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD y5, YY, 4 * SIZE
|
||||
LD y6, YY, 5 * SIZE
|
||||
LD y7, YY, 6 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD y8, YY, 7 * SIZE
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
/* Main loop, eight rows per iteration in two halves of four; MADD is a
 * macro defined elsewhere (presumably d = a * b + c -- confirm). Loads for
 * the following rows are interleaved with the arithmetic. */
.L22:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
LD y1, YY, 8 * SIZE
|
||||
LD y2, YY, 9 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
LD y3, YY, 10 * SIZE
|
||||
LD y4, YY, 11 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
ST t2, YY, 1 * SIZE
|
||||
ST t3, YY, 2 * SIZE
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
LD a2, AO1, 9 * SIZE
|
||||
LD y5, YY, 12 * SIZE
|
||||
LD y6, YY, 13 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
LD a3, AO1, 10 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
LD a4, AO1, 11 * SIZE
|
||||
LD y7, YY, 14 * SIZE
|
||||
LD y8, YY, 15 * SIZE
|
||||
ST t1, YY, 4 * SIZE
|
||||
ST t2, YY, 5 * SIZE
|
||||
ST t3, YY, 6 * SIZE
|
||||
ST t4, YY, 7 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
/* Drain: finish the last group of eight rows; rows 4..7 of the column are
 * loaded here. */
.L23:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
ST t2, YY, 1 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
ST t3, YY, 2 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
ST t1, YY, 4 * SIZE
|
||||
ST t2, YY, 5 * SIZE
|
||||
ST t3, YY, 6 * SIZE
|
||||
ST t4, YY, 7 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Row remainder: bit 2 of M set means four more rows. */
.L25:
|
||||
andi I, M, 4
|
||||
bge $r0, I, .L26
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y2, a2, x1, y2
|
||||
MADD y3, a3, x1, y3
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
MADD y4, a4, x1, y4
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
ST y1, YY, -4 * SIZE
|
||||
ST y2, YY, -3 * SIZE
|
||||
ST y3, YY, -2 * SIZE
|
||||
ST y4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Row remainder: bit 1 of M set means two more rows. */
.L26:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L27
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
addi.d YY, YY, 2 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
addi.d AO1, AO1, 2 * SIZE
|
||||
ST y1, YY, -2 * SIZE
|
||||
ST y2, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Row remainder: bit 0 of M set means one last row. */
.L27:
|
||||
andi I, M, 1
|
||||
bge $r0, I, .L900
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
ST y1, YY, 0 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Write-back: when INCY equals SIZE nothing needs copying and control goes
 * straight to .L999. Otherwise the M contiguous results in BUFFER are
 * scattered to Y with stride INCY. YORIG is reused as scratch for SIZE. */
.L900:
|
||||
li.d YORIG, SIZE
|
||||
srai.d I, M, 2
|
||||
beq INCY, YORIG, .L999
|
||||
move XX, BUFFER
|
||||
bge $r0, I, .L905
|
||||
.align 3
|
||||
|
||||
/* Scatter loop, four elements per iteration. */
.L902:
|
||||
LD a1, XX, 0 * SIZE
|
||||
LD a2, XX, 1 * SIZE
|
||||
LD a3, XX, 2 * SIZE
|
||||
LD a4, XX, 3 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 4 * SIZE
|
||||
blt $r0, I, .L902
|
||||
.align 3
|
||||
|
||||
/* Scatter tail: M mod 4 leftover elements. */
.L905:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L906:
|
||||
LD a1, XX, 0 * SIZE
|
||||
addi.d XX, XX, 1 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L906
|
||||
.align 3
|
||||
|
||||
/* Epilogue: restore $r23 / $r24 (and $f18..$f20 when __64BIT__ is not
 * defined), release the stack frame and return through $r1.
 * NOTE(review): the moves from $r17 and $f22 into $r4 / $f0 use registers
 * that hold no result in this routine -- confirm callers ignore the return
 * value. */
.L999:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
#ifndef __64BIT__
|
||||
fld.d $f18, $sp, 16
|
||||
fld.d $f19, $sp, 24
|
||||
fld.d $f20, $sp, 32
|
||||
#endif
|
||||
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, 16
|
||||
#else
|
||||
addi.d $sp, $sp, 48
|
||||
#endif
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,436 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Unused param dummy1 */
|
||||
#define M $r4
|
||||
#define N $r5
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INCX $r10
|
||||
#define Y $r11
|
||||
#define INCY $r6
|
||||
#define BUFFER $r16
|
||||
#define XORIG $r18
|
||||
#define XX $r12
|
||||
#define YY $r13
|
||||
#define I $r14
|
||||
#define J $r15
|
||||
#define AO1 $r23
|
||||
#define AO2 $r24
|
||||
#define ALPHA $f0
|
||||
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
#define y1 $f14
|
||||
#define y2 $f15
|
||||
#define y3 $f16
|
||||
#define y4 $f17
|
||||
#define x1 $f3
|
||||
#define x2 $f1
|
||||
#define x3 $f2
|
||||
#define x4 $f4
|
||||
#define x5 $f5
|
||||
#define x6 $f6
|
||||
#define x7 $f7
|
||||
#define x8 $f18
|
||||
|
||||
PROLOGUE /* Kernel entry. The code that follows forms, for each of N columns of A, the sum over M rows of A[i]*x[i] and adds ALPHA times that sum to the matching y element. */
|
||||
|
||||
LDARG INCY, $sp, 0 /* INCY and BUFFER are fetched from the caller's stack before $sp is moved (LDARG is a macro defined outside this file; presumably an integer load) */
|
||||
LDARG BUFFER, $sp, 8
|
||||
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, -16 /* frame: two 8-byte slots for $r23/$r24 */
|
||||
#else
|
||||
addi.d $sp, $sp, -32 /* larger frame: also holds $f18 at offset 16 */
|
||||
#endif
|
||||
MTC y1, $r0 /* presumably clears accumulator y1 by moving integer zero into it (MTC is an external macro) */
|
||||
SDARG $r23, $sp, 0 /* save $r23/$r24, used below as AO1/AO2 */
|
||||
SDARG $r24, $sp, 8
|
||||
slli.d LDA, LDA, BASE_SHIFT /* convert LDA from elements to bytes */
|
||||
#ifndef __64BIT__
|
||||
fst.d $f18, $sp, 16
|
||||
#endif
|
||||
slli.d INCX, INCX, BASE_SHIFT /* convert INCX to a byte stride */
|
||||
bge $r0, M, .L999 /* nothing to do when M <= 0 */
|
||||
slli.d INCY, INCY, BASE_SHIFT /* convert INCY to a byte stride */
|
||||
bge $r0, N, .L999 /* nothing to do when N <= 0 */
|
||||
li.d I, SIZE
|
||||
move XORIG, X /* default: read x in place */
|
||||
beq INCX, I, .L10 /* x already contiguous (byte stride == SIZE): skip the packing loops */
|
||||
srai.d I, M, 2 /* I = M / 4 groups for the 4-way packing loop */
|
||||
move XORIG, BUFFER /* otherwise the dot-product loops read x from BUFFER */
|
||||
move YY, BUFFER /* YY is used as the write cursor into BUFFER while packing */
|
||||
bge $r0, I, .L05
|
||||
.align 3
|
||||
|
||||
.L02: /* Pack loop: gather four strided x elements per iteration and store them contiguously through YY. */
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX /* advance X by the byte stride after every load */
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, YY, 0 * SIZE /* four consecutive slots of the packed copy */
|
||||
ST a2, YY, 1 * SIZE
|
||||
ST a3, YY, 2 * SIZE
|
||||
ST a4, YY, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
blt $r0, I, .L02 /* repeat while groups remain */
|
||||
.align 3
|
||||
|
||||
.L05: /* Pack the remaining M mod 4 elements of x, one per iteration. */
|
||||
andi I, M, 3 /* I = M & 3 leftover elements */
|
||||
bge $r0, I, .L10 /* none left: go to the column loop */
|
||||
.align 3
|
||||
|
||||
.L06: /* single-element copy from strided X to the contiguous cursor YY */
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, YY, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 1 * SIZE
|
||||
blt $r0, I, .L06
|
||||
.align 3
|
||||
|
||||
.L10: /* Column-pair loop setup: J = N / 2 pairs; YY becomes the store cursor into y while Y stays the load cursor. */
|
||||
srai.d J, N, 1
|
||||
move YY, Y
|
||||
bge $r0, J, .L20 /* fewer than two columns: go to the single-column tail */
|
||||
.align 3
|
||||
|
||||
.L11: /* Start of one column pair: AO1 = current column, AO2 = next column, A advances by two columns. */
|
||||
move AO1, A
|
||||
MOV y2, y1 /* y1 holds zero here (cleared by MTC before each pass); copy it into the other three accumulators */
|
||||
add.d AO2, A, LDA
|
||||
MOV y3, y1
|
||||
add.d A, AO2, LDA
|
||||
MOV y4, y1
|
||||
srai.d I, M, 3 /* I = M / 8 unrolled row groups */
|
||||
move XX, XORIG /* restart the x cursor for this column pair */
|
||||
bge $r0, I, .L15 /* fewer than 8 rows: skip the unrolled part */
|
||||
LD a1, AO1, 0 * SIZE /* preload rows 0..3 of both columns and x[0..7] so .L12/.L13 can start with multiplies */
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a2, AO2, 0 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
LD a4, AO2, 1 * SIZE
|
||||
LD x4, XX, 3 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x5, XX, 4 * SIZE
|
||||
LD a6, AO2, 2 * SIZE
|
||||
LD x6, XX, 5 * SIZE
|
||||
LD a7, AO1, 3 * SIZE
|
||||
LD x7, XX, 6 * SIZE
|
||||
LD a8, AO2, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD x8, XX, 7 * SIZE
|
||||
bge $r0, I, .L13 /* only one group of 8: go straight to the drain block */
|
||||
.align 3
|
||||
.L12: /* Steady-state body for a column pair: consumes 8 rows per iteration while reloading operands for the next iteration. MADD is an external macro; assumed MADD d,a,b,c = a*b + c. */
|
||||
MADD y1, a1, x1, y1 /* y1/y3 accumulate column AO1 (alternating rows), y2/y4 accumulate column AO2 */
|
||||
LD a1, AO1, 4 * SIZE /* each register is reloaded right after its last use */
|
||||
MADD y2, a2, x1, y2
|
||||
LD a2, AO2, 4 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
MADD y4, a4, x2, y4
|
||||
LD a4, AO2, 5 * SIZE
|
||||
LD x1, XX, 8 * SIZE /* x values for the next iteration */
|
||||
LD x2, XX, 9 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y2, a6, x3, y2
|
||||
LD a6, AO2, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
MADD y4, a8, x4, y4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
LD x3, XX, 10 * SIZE
|
||||
LD x4, XX, 11 * SIZE
|
||||
MADD y1, a1, x5, y1 /* rows 4..7 of this group, using the values loaded above */
|
||||
LD a1, AO1, 8 * SIZE /* rows 0..3 of the next group */
|
||||
MADD y2, a2, x5, y2
|
||||
LD a2, AO2, 8 * SIZE
|
||||
MADD y3, a3, x6, y3
|
||||
LD a3, AO1, 9 * SIZE
|
||||
MADD y4, a4, x6, y4
|
||||
LD a4, AO2, 9 * SIZE
|
||||
LD x5, XX, 12 * SIZE
|
||||
LD x6, XX, 13 * SIZE
|
||||
MADD y1, a5, x7, y1
|
||||
LD a5, AO1, 10 * SIZE
|
||||
MADD y2, a6, x7, y2
|
||||
LD a6, AO2, 10 * SIZE
|
||||
MADD y3, a7, x8, y3
|
||||
LD a7, AO1, 11 * SIZE
|
||||
MADD y4, a8, x8, y4
|
||||
LD a8, AO2, 11 * SIZE
|
||||
LD x7, XX, 14 * SIZE
|
||||
LD x8, XX, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 8 * SIZE /* advance all three cursors by the 8 rows consumed */
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
.L13: /* Drain block for the last group of 8 rows of a column pair: same multiplies as .L12 but nothing is loaded for a following group. */
|
||||
MADD y1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE /* rows 4..7 of this final group are still fetched here */
|
||||
MADD y2, a2, x1, y2
|
||||
LD a2, AO2, 4 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
MADD y4, a4, x2, y4
|
||||
LD a4, AO2, 5 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y2, a6, x3, y2
|
||||
LD a6, AO2, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
MADD y4, a8, x4, y4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
MADD y1, a1, x5, y1 /* x5..x8 were already loaded by the preload or the previous .L12 iteration */
|
||||
MADD y2, a2, x5, y2
|
||||
MADD y3, a3, x6, y3
|
||||
MADD y4, a4, x6, y4
|
||||
MADD y1, a5, x7, y1
|
||||
addi.d XX, XX, 8 * SIZE /* cursor updates are interleaved with the last multiplies */
|
||||
MADD y2, a6, x7, y2
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
MADD y3, a7, x8, y3
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
MADD y4, a8, x8, y4
|
||||
.align 3
|
||||
|
||||
.L15: /* Column pair, 4-row tail: runs once when bit 2 of M is set. */
|
||||
andi I, M, 4
|
||||
bge $r0, I, .L17 /* bit clear: skip to the 1..3 row remainder */
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a2, AO2, 0 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a4, AO2, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD y1, a1, x1, y1 /* same accumulator assignment as the unrolled loop: y1/y3 for AO1, y2/y4 for AO2 */
|
||||
LD a6, AO2, 2 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
LD a7, AO1, 3 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD x4, XX, 3 * SIZE
|
||||
MADD y4, a4, x2, y4
|
||||
LD a8, AO2, 3 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
MADD y2, a6, x3, y2
|
||||
addi.d XX, XX, 4 * SIZE /* advance cursors past the 4 rows */
|
||||
MADD y3, a7, x4, y3
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
MADD y4, a8, x4, y4
|
||||
addi.d AO2, AO2, 4 * SIZE
|
||||
.align 3
|
||||
|
||||
.L17: /* Column pair: fold the partial sums, then handle the last M mod 4 rows. */
|
||||
andi I, M, 3
|
||||
ADD y1, y1, y3 /* y1 = total for column AO1 so far */
|
||||
ADD y2, y2, y4 /* y2 = total for column AO2 so far */
|
||||
bge $r0, I, .L19
|
||||
.align 3
|
||||
.L18: /* one row per iteration for both columns */
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a2, AO2, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 1 * SIZE
|
||||
addi.d AO1, AO1, 1 * SIZE
|
||||
addi.d AO2, AO2, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y2, a2, x1, y2
|
||||
blt $r0, I, .L18
|
||||
.align 3
|
||||
|
||||
.L19: /* Write-back for a column pair: two y elements are read through Y, updated with ALPHA times the column sums, and stored through YY. */
|
||||
LD a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
MADD a1, y1, ALPHA, a1 /* a1 = y1*ALPHA + old y value (assuming MADD d,a,b,c = a*b + c) */
|
||||
addi.d J, J, -1
|
||||
MADD a2, y2, ALPHA, a2
|
||||
MTC y1, $r0 /* reset y1 for the next pass; .L11 and .L20 copy it into the other accumulators */
|
||||
ST a1, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST a2, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
blt $r0, J, .L11 /* next column pair */
|
||||
.align 3
|
||||
|
||||
.L20: /* Single-column tail: executed when N is odd; uses only AO1 with accumulators y1 and y3. */
|
||||
andi J, N, 1
|
||||
MOV y3, y1 /* y1 is zero at this point; clear the second accumulator from it */
|
||||
move AO1, A
|
||||
bge $r0, J, .L999 /* N even: finished */
|
||||
srai.d I, M, 3 /* I = M / 8 unrolled row groups */
|
||||
move XX, XORIG
|
||||
bge $r0, I, .L25
|
||||
LD a1, AO1, 0 * SIZE /* preload rows 0..3 of the column and x[0..7] for .L22/.L23 */
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
LD a7, AO1, 3 * SIZE
|
||||
LD x4, XX, 3 * SIZE
|
||||
LD x5, XX, 4 * SIZE
|
||||
LD x6, XX, 5 * SIZE
|
||||
LD x7, XX, 6 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD x8, XX, 7 * SIZE
|
||||
bge $r0, I, .L23 /* only one group of 8: go to the drain block */
|
||||
.align 3
|
||||
.L22: /* Steady-state body for the single column: 8 rows per iteration, alternating between accumulators y1 and y3, with operands for the next iteration reloaded as they are consumed. */
|
||||
MADD y1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
LD x1, XX, 8 * SIZE /* x values for the next iteration */
|
||||
LD x2, XX, 9 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
LD x3, XX, 10 * SIZE
|
||||
LD x4, XX, 11 * SIZE
|
||||
MADD y1, a1, x5, y1 /* rows 4..7 of this group */
|
||||
LD a1, AO1, 8 * SIZE /* rows 0..3 of the next group */
|
||||
MADD y3, a3, x6, y3
|
||||
LD a3, AO1, 9 * SIZE
|
||||
LD x5, XX, 12 * SIZE
|
||||
LD x6, XX, 13 * SIZE
|
||||
MADD y1, a5, x7, y1
|
||||
LD a5, AO1, 10 * SIZE
|
||||
MADD y3, a7, x8, y3
|
||||
LD a7, AO1, 11 * SIZE
|
||||
LD x7, XX, 14 * SIZE
|
||||
LD x8, XX, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
.L23: /* Drain block for the last group of 8 rows of the single column: no loads for a following group. */
|
||||
MADD y1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE /* rows 4..7 of this final group */
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
MADD y1, a1, x5, y1 /* x5..x8 were loaded by the preload or the previous .L22 iteration */
|
||||
MADD y3, a3, x6, y3
|
||||
MADD y1, a5, x7, y1
|
||||
MADD y3, a7, x8, y3
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
.L25: /* Single column, 4-row tail: runs once when bit 2 of M is set. */
|
||||
andi I, M, 4
|
||||
bge $r0, I, .L27
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD y1, a1, x1, y1 /* rows alternate between y1 and y3, as in the unrolled loop */
|
||||
LD a7, AO1, 3 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD x4, XX, 3 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
addi.d XX, XX, 4 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
.align 3
|
||||
|
||||
.L27: /* Single column: fold y3 into y1, then process the last M mod 4 rows. */
|
||||
andi I, M, 3
|
||||
ADD y1, y1, y3
|
||||
bge $r0, I, .L29
|
||||
.align 3
|
||||
.L28: /* one row per iteration */
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 1 * SIZE
|
||||
addi.d AO1, AO1, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
blt $r0, I, .L28
|
||||
.align 3
|
||||
|
||||
.L29: /* Write-back for the single trailing column: read one y element through Y, add ALPHA times the column sum, store through YY. */
|
||||
LD a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
MADD a1, y1, ALPHA, a1 /* a1 = y1*ALPHA + old y value (assuming MADD d,a,b,c = a*b + c) */
|
||||
ST a1, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
.align 3
|
||||
|
||||
.L999: /* Common exit: restore the registers saved at entry, release the stack frame and return 0. */
|
||||
LDARG $r23, $sp, 0 /* restore $r23/$r24 (used as AO1/AO2) from the slots written at entry */
|
||||
LDARG $r24, $sp, 8
|
||||
#ifndef __64BIT__
|
||||
fld.d $f18, $sp, 16
|
||||
#endif
|
||||
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, 16 /* frame sizes mirror the ones allocated at entry */
|
||||
#else
|
||||
addi.d $sp, $sp, 32
|
||||
#endif
|
||||
move $r4, $r0 /* return 0 in $r4; $r17 is never written by this kernel, so it must not be used as the result */
|
||||
fmov.d $f0, $f22 /* copies a1 into $f0; NOTE(review): presumably not needed for an integer-returning kernel, kept unchanged */
|
||||
jirl $r0, $r1, 0x0 /* return through $r1 without linking */
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r18
|
||||
#define TEMP $r7
|
||||
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
PROLOGUE /* Index-of-maximum-absolute-value kernel: returns in x1 the 1-based position of the largest |x[i]| found by the code below, or 0 when N <= 0 or INCX <= 0. */
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) /* F_INTERFACE only: N and INCX arrive as pointers and are loaded through them. NOTE(review): the `0(N)` operand form differs from the `reg, base, offset` form of every other load here; confirm this path assembles. */
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
li.d x1, 0 /* result 0 for the early exits below */
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT /* convert INCX to a byte stride */
|
||||
bge $r0, INCX, .L999
|
||||
LD a1, X, 0 * SIZE /* first element seeds every running maximum */
|
||||
addi.d N, N, -1 /* N now counts the elements after the first */
|
||||
li.d x1, 1 /* a single-element vector returns index 1 */
|
||||
bge $r0, N, .L999
|
||||
FABS s1, a1 /* s1..s4: four independent running maxima, all started at |x[0]| */
|
||||
add.d X, X, INCX
|
||||
FABS s2, a1
|
||||
li.d x2, 1 /* x1..x4: index belonging to each running maximum */
|
||||
FABS s3, a1
|
||||
srai.d I, N, 3 /* I = number of 8-element groups */
|
||||
FABS s4, a1
|
||||
li.d x3, 1
|
||||
li.d TEMP, 2 /* TEMP = 1-based index of the element handled by lane 1 next */
|
||||
li.d x4, 1
|
||||
bge $r0, I, .L15 /* fewer than 8 remaining: scalar loop only */
|
||||
LD a1, X, 0 * SIZE /* preload eight elements for .L12/.L13 */
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13 /* only one group: go to the drain block */
|
||||
.align 3
|
||||
|
||||
.L12: /* Steady-state body: eight elements per iteration in two halves of four lanes, reloading a1..a8 for the next iteration. CMPLT/CMOVT/MOVT are external macros; assumed: CMPLT sets the flag when its second operand < third, CMOVT/MOVT take the new value when the flag is set. */
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1 /* flag k set when lane k's running maximum is smaller than the new magnitude */
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1) /* lanes 2..4 record TEMP (lane 1's index); their +1/+2/+3 offsets are added once at the end of .L13 */
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4 /* four elements consumed */
|
||||
addi.d I, I, -1
|
||||
FABS t1, a5 /* second half: a5..a8 */
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
.L13: /* Drain block: process the eight elements already held in a1..a8 without loading more, then fix up the lane indices. */
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
FABS t1, a5 /* second half: a5..a8 */
|
||||
addi.d TEMP, TEMP, 4
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
addi.d x2, x2, 1 /* lanes 2..4 stored lane 1's index; add each lane's offset within the group of four */
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
.L15: /* Scalar remainder: the last N mod 8 elements (N was already reduced by one at entry), tracked in lane 1 only. */
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16: /* one element per iteration */
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, s1, t1 /* compare running maximum s1 against the new magnitude */
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0) /* TEMP is the exact 1-based index here (step 1) */
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
.L998: /* Reduce the four lanes to one: (s1,x1) vs (s2,x2), (s3,x3) vs (s4,x4), then the two winners. NOTE(review): the comparisons are strict, so on equal magnitudes lane 1 (or lane 3) keeps its index even if the other lane saw that value at a smaller index; confirm this is acceptable for first-occurrence semantics. */
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
.L999: /* Return: x1 ($r17) holds the result index (0 for the early exits). */
|
||||
move $r4, $r17 /* integer result goes out in $r4 */
|
||||
fmov.d $f0, $f22 /* also copies s1 ($f22) into $f0 */
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,233 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r18
|
||||
#define TEMP $r7
|
||||
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
PROLOGUE /* Index-of-minimum-absolute-value kernel: returns in x1 the 1-based position of the smallest |x[i]| found by the code below, or 0 when N <= 0 or INCX <= 0. */
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) /* F_INTERFACE only: N and INCX arrive as pointers and are loaded through them. NOTE(review): the `0(N)` operand form differs from the `reg, base, offset` form of every other load here; confirm this path assembles. */
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
li.d x1, 0 /* result 0 for the early exits below */
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT /* convert INCX to a byte stride */
|
||||
bge $r0, INCX, .L999
|
||||
LD a1, X, 0 * SIZE /* first element seeds every running minimum */
|
||||
addi.d N, N, -1 /* N now counts the elements after the first */
|
||||
li.d x1, 1 /* a single-element vector returns index 1 */
|
||||
bge $r0, N, .L999
|
||||
FABS s1, a1 /* s1..s4: four independent running minima, all started at |x[0]| */
|
||||
add.d X, X, INCX
|
||||
FABS s2, a1
|
||||
li.d x2, 1 /* x1..x4: index belonging to each running minimum */
|
||||
FABS s3, a1
|
||||
srai.d I, N, 3 /* I = number of 8-element groups */
|
||||
FABS s4, a1
|
||||
li.d x3, 1
|
||||
li.d TEMP, 2 /* TEMP = 1-based index of the element handled by lane 1 next */
|
||||
li.d x4, 1
|
||||
bge $r0, I, .L15 /* fewer than 8 remaining: scalar loop only */
|
||||
LD a1, X, 0 * SIZE /* preload eight elements for .L12/.L13 */
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13 /* only one group: go to the drain block */
|
||||
.align 3
|
||||
|
||||
.L12: /* Steady-state body: eight elements per iteration in two halves of four lanes, reloading a1..a8 for the next iteration. CMPLT/CMOVT/MOVT are external macros; assumed: CMPLT sets the flag when its second operand < third, CMOVT/MOVT take the new value when the flag is set. */
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, t1, s1 /* flag k set when the new magnitude is smaller than lane k's running minimum */
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, t2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, t3, s3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, t4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1) /* lanes 2..4 record TEMP (lane 1's index); their +1/+2/+3 offsets are added once at the end of .L13 */
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4 /* four elements consumed */
|
||||
addi.d I, I, -1
|
||||
FABS t1, a5 /* second half: a5..a8 */
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, t1, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, t2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, t3, s3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, t4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
.L13: /* Drain block: process the eight elements already held in a1..a8 without loading more, then fix up the lane indices. */
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t2, s2
|
||||
CMPLT $fcc2, t3, s3
|
||||
CMPLT $fcc3, t4, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
FABS t1, a5 /* second half: a5..a8 */
|
||||
addi.d TEMP, TEMP, 4
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t2, s2
|
||||
CMPLT $fcc2, t3, s3
|
||||
CMPLT $fcc3, t4, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
addi.d x2, x2, 1 /* lanes 2..4 stored lane 1's index; add each lane's offset within the group of four */
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
.L15: /* Scalar remainder: the last N mod 8 elements (N was already reduced by one at entry), tracked in lane 1 only. */
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16: /* one element per iteration */
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, t1, s1 /* compare the new magnitude against running minimum s1 */
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0) /* TEMP is the exact 1-based index here (step 1) */
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
.L998: /* Reduce the four lanes to one: (s1,x1) vs (s2,x2), (s3,x3) vs (s4,x4), then the two winners. NOTE(review): the comparisons are strict, so on equal magnitudes lane 1 (or lane 3) keeps its index even if the other lane saw that value at a smaller index; confirm this is acceptable for first-occurrence semantics. */
|
||||
CMPLT $fcc0, s2, s1
|
||||
CMPLT $fcc1, s4, s3
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s3, s1
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
.L999: /* Return: x1 ($r17) holds the result index (0 for the early exits). */
|
||||
move $r4, $r17 /* integer result goes out in $r4 */
|
||||
fmov.d $f0, $f22 /* also copies s1 ($f22) into $f0 */
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,217 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r18
|
||||
#define TEMP $r7
|
||||
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define t5 $f4
|
||||
#define t6 $f5
|
||||
#define t7 $f6
|
||||
#define t8 $f7
|
||||
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
PROLOGUE /* Index-of-maximum kernel for two-component elements: each element's magnitude is |x[2i]| + |x[2i+1]|; returns in x1 the 1-based position of the largest one, or 0 when N <= 0 or INCX <= 0. */
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) /* F_INTERFACE only: N and INCX arrive as pointers and are loaded through them. NOTE(review): the `0(N)` operand form differs from the `reg, base, offset` form of every other load here; confirm this path assembles. */
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
li.d x1, 0 /* result 0 for the early exits below */
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, ZBASE_SHIFT /* convert INCX to a byte stride (ZBASE_SHIFT is defined outside this file; presumably accounts for two components per element) */
|
||||
bge $r0, INCX, .L999
|
||||
LD a1, X, 0 * SIZE /* both components of the first element */
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD s1, t1, t2 /* s1..s4: four running maxima, all seeded with the first element's magnitude */
|
||||
ADD s2, t1, t2
|
||||
ADD s3, t1, t2
|
||||
ADD s4, t1, t2
|
||||
addi.d N, N, -1 /* N now counts the elements after the first */
|
||||
li.d x1, 1 /* a single-element vector returns index 1 */
|
||||
bge $r0, N, .L999
|
||||
add.d X, X, INCX
|
||||
li.d x2, 1 /* x1..x4: index belonging to each running maximum */
|
||||
srai.d I, N, 2 /* I = number of 4-element groups */
|
||||
li.d x3, 1
|
||||
li.d TEMP, 2 /* TEMP = 1-based index of the element handled by lane 1 next */
|
||||
li.d x4, 1
|
||||
bge $r0, I, .L15 /* fewer than 4 remaining: scalar loop only */
|
||||
LD a1, X, 0 * SIZE /* preload four elements (eight components) for .L12/.L13 */
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13 /* only one group: go to the drain block */
|
||||
.align 3
|
||||
|
||||
.L12: /* Steady-state body: four elements per iteration, one per lane, reloading a1..a8 for the next iteration. CMPLT/CMOVT/MOVT are external macros; assumed: CMPLT sets the flag when its second operand < third, CMOVT/MOVT take the new value when the flag is set. */
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE /* each component register is reloaded after its absolute value has been taken */
|
||||
FABS t2, a2
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t3, a3
|
||||
add.d X, X, INCX
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t6, a6
|
||||
LD a4, X, 1 * SIZE
|
||||
FABS t7, a7
|
||||
add.d X, X, INCX
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2 /* per-element magnitude = sum of the two absolute components */
|
||||
LD a5, X, 0 * SIZE
|
||||
ADD t3, t3, t4
|
||||
LD a6, X, 1 * SIZE
|
||||
ADD t5, t5, t6
|
||||
add.d X, X, INCX
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, s1, t1 /* flag k set when lane k's running maximum is smaller than the new magnitude */
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t3
|
||||
LD a8, X, 1 * SIZE
|
||||
CMPLT $fcc2, s3, t5
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc3, s4, t7
|
||||
addi.d I, I, -1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1) /* lanes 2..4 record TEMP (lane 1's index); their +1/+2/+3 offsets are added once at the end of .L13 */
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4 /* four elements consumed */
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
.L13: /* Drain block: process the four elements already held in a1..a8 without loading more, then fix up the lane indices. */
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
FABS t6, a6
|
||||
FABS t7, a7
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2 /* per-element magnitude = sum of the two absolute components */
|
||||
ADD t3, t3, t4
|
||||
ADD t5, t5, t6
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t3
|
||||
CMPLT $fcc2, s3, t5
|
||||
CMPLT $fcc3, s4, t7
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
addi.d x2, x2, 1 /* lanes 2..4 stored lane 1's index; add each lane's offset within the group of four */
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
.L15: /* Scalar remainder: the last N mod 4 elements (N was already reduced by one at entry), tracked in lane 1 only. */
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16: /* one two-component element per iteration */
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD t1, t1, t2 /* magnitude = sum of the two absolute components */
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0) /* TEMP is the exact 1-based index here (step 1) */
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
.L998: /* Reduce the four lanes to one: (s1,x1) vs (s2,x2), (s3,x3) vs (s4,x4), then the two winners. NOTE(review): the comparisons are strict, so on equal magnitudes lane 1 (or lane 3) keeps its index even if the other lane saw that value at a smaller index; confirm this is acceptable for first-occurrence semantics. */
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
.L999: /* Return: x1 ($r17) holds the result index (0 for the early exits). */
|
||||
move $r4, $r17 /* integer result goes out in $r4 */
|
||||
fmov.d $f0, $f22 /* also copies s1 ($f22) into $f0 */
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,217 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Index-of-extreme kernel over pairs of values.
 *
 * Each element is read as two values (offsets 0 and 1 * SIZE from X); the
 * kernel forms FABS(first) + FABS(second) and keeps the running candidate in
 * s1..s4 together with its 1-based position in x1..x4.  X advances by
 * INCX << ZBASE_SHIFT bytes per element.
 *
 * A new sum replaces the candidate when CMPLT(new, current) sets the flag,
 * i.e. presumably when it is strictly smaller (minimum search).  LD, FABS,
 * ADD, CMPLT, CMOVT, MOVT and LDINT are macros defined outside this view --
 * confirm their exact operand semantics in common.h.
 *
 * Return: $r4 = x1 (position), $f0 = s1 (value).
 *         $r4 = 0 when N <= 0 or the scaled INCX <= 0.
 */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r18
|
||||
#define TEMP $r7
|
||||
/* a1..a8: raw values of four preloaded elements (two values each). */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* t1..t8: scratch for the FABS results and the per-element sums. */
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define t5 $f4
|
||||
#define t6 $f5
|
||||
#define t7 $f6
|
||||
#define t8 $f7
|
||||
/* s1..s4: one running candidate per unrolled lane; x1..x4: its position. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are dereferenced first. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Position 0 is what the two early exits below return. */
li.d x1, 0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* Seed all four lanes with the sum computed from the first element. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD s1, t1, t2
|
||||
ADD s2, t1, t2
|
||||
ADD s3, t1, t2
|
||||
ADD s4, t1, t2
|
||||
/* N now counts the elements after the first; a single element returns 1. */
addi.d N, N, -1
|
||||
li.d x1, 1
|
||||
bge $r0, N, .L999
|
||||
add.d X, X, INCX
|
||||
/* I = number of 4-element groups; TEMP = position of the next element. */
li.d x2, 1
|
||||
srai.d I, N, 2
|
||||
li.d x3, 1
|
||||
li.d TEMP, 2
|
||||
li.d x4, 1
|
||||
bge $r0, I, .L15
|
||||
/* Preload the first group of four elements. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/*
 * Main loop: evaluate the preloaded group while loading the next one.
 * Every lane records TEMP (the group's first position); the per-lane
 * offsets are added once after the loop.
 */
.L12:
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t3, a3
|
||||
add.d X, X, INCX
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t6, a6
|
||||
LD a4, X, 1 * SIZE
|
||||
FABS t7, a7
|
||||
add.d X, X, INCX
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2
|
||||
LD a5, X, 0 * SIZE
|
||||
ADD t3, t3, t4
|
||||
LD a6, X, 1 * SIZE
|
||||
ADD t5, t5, t6
|
||||
add.d X, X, INCX
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, t1, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, t3, s2
|
||||
LD a8, X, 1 * SIZE
|
||||
CMPLT $fcc2, t5, s3
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc3, t7, s4
|
||||
addi.d I, I, -1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: evaluate the last preloaded group without issuing more loads. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
FABS t6, a6
|
||||
FABS t7, a7
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
ADD t5, t5, t6
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t3, s2
|
||||
CMPLT $fcc2, t5, s3
|
||||
CMPLT $fcc3, t7, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
/*
 * Convert the recorded group positions of lanes 2..4 into element positions.
 * NOTE(review): the offsets are added even to a lane that never recorded a
 * position (still 1); such a lane still holds the seed value, so it should
 * never win the reduction at .L998 -- confirm.
 */
addi.d x2, x2, 1
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
/* Tail: the remaining N & 3 elements, one per iteration, lane 1 only. */
.L15:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD t1, t1, t2
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/*
 * Reduce the four lanes into s1/x1: (s1,s2) and (s3,s4) first, then the two
 * winners.
 * NOTE(review): a lane replaces another only when its flag is set, so on
 * equal values the lower-numbered lane is kept regardless of which position
 * is smaller -- confirm this matches the expected tie-breaking rule.
 */
.L998:
|
||||
CMPLT $fcc0, s2, s1
|
||||
CMPLT $fcc1, s4, s3
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s3, s1
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
/* Return: $r17 is x1 (position), $f22 is s1 (value). */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Strided scan that keeps a running extreme of N values.
 *
 * X advances by INCX << BASE_SHIFT bytes per value.  Each value v is tested
 * with CMPLT(current, v) and merged with CMOVT(current, current, v, flag),
 * i.e. presumably "take v when current < v" (maximum search).  LD, MOV, MTC,
 * CMPLT, CMOVT and LDINT are macros defined outside this view -- confirm
 * their operand semantics in common.h.
 *
 * The unrolled loop handles eight values per iteration on four independent
 * accumulators (s1..s4), which are reduced into s1 at .L998.
 *
 * Return: $f0 = s1.  When N <= 0 or the scaled INCX <= 0, s1 still holds the
 * value written by "MTC s1, $r0" (presumably 0.0).
 */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r17
|
||||
/* NOTE(review): TEMP is not referenced anywhere in this kernel. */
#define TEMP $r18
|
||||
/* a1..a8: the eight values of one unrolled group. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* s1..s4: running candidates; s1 also carries the final result. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are dereferenced first. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Default result for the two early exits below. */
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* Seed every accumulator with the first value; N then counts the rest. */
LD s1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
add.d X, X, INCX
|
||||
MOV s2, s1
|
||||
bge $r0, N, .L999
|
||||
MOV s3, s1
|
||||
/* I = number of 8-value groups. */
srai.d I, N, 3
|
||||
MOV s4, s1
|
||||
bge $r0, I, .L15
|
||||
/* Preload a1..a6; a7 and a8 are loaded inside the loop / drain code. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/*
 * Main loop: merge a1..a4 and then a5..a8 into s1..s4 while loading the
 * remaining two values of this group and the first six of the next.
 */
.L12:
|
||||
CMPLT $fcc0, s1, a1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, a2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, a3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, a4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, a1, $fcc0
|
||||
LD a1, X, 0 * SIZE
|
||||
CMOVT s2, s2, a2, $fcc1
|
||||
add.d X, X, INCX
|
||||
CMOVT s3, s3, a3, $fcc2
|
||||
LD a2, X, 0 * SIZE
|
||||
CMOVT s4, s4, a4, $fcc3
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, a5
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, a6
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, a7
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, a8
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, a5, $fcc0
|
||||
LD a5, X, 0 * SIZE
|
||||
CMOVT s2, s2, a6, $fcc1
|
||||
add.d X, X, INCX
|
||||
CMOVT s3, s3, a7, $fcc2
|
||||
LD a6, X, 0 * SIZE
|
||||
CMOVT s4, s4, a8, $fcc3
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: finish the last group (only a7 and a8 still need loading). */
.L13:
|
||||
CMPLT $fcc0, s1, a1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, a2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, a3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, a4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, a1, $fcc0
|
||||
CMOVT s2, s2, a2, $fcc1
|
||||
CMOVT s3, s3, a3, $fcc2
|
||||
CMOVT s4, s4, a4, $fcc3
|
||||
CMPLT $fcc0, s1, a5
|
||||
CMPLT $fcc1, s2, a6
|
||||
CMPLT $fcc2, s3, a7
|
||||
CMPLT $fcc3, s4, a8
|
||||
CMOVT s1, s1, a5, $fcc0
|
||||
CMOVT s2, s2, a6, $fcc1
|
||||
CMOVT s3, s3, a7, $fcc2
|
||||
CMOVT s4, s4, a8, $fcc3
|
||||
.align 3
|
||||
|
||||
/* Tail: the remaining N & 7 values, merged into s1 one at a time. */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, s1, a1
|
||||
CMOVT s1, s1, a1, $fcc0
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Reduce the four accumulators into s1. */
.L998:
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
.align 3
|
||||
|
||||
/*
 * Return s1 ($f22) in $f0.
 * NOTE(review): $r17 is the loop counter I in this kernel, so the move into
 * $r4 does not carry a result -- it looks like a leftover from the
 * position-returning kernels; confirm it can be ignored.
 */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,174 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Strided scan that keeps a running extreme of N values.
 *
 * X advances by INCX << BASE_SHIFT bytes per value.  Each value v is tested
 * with CMPLT(v, current) and merged with CMOVT(current, current, v, flag),
 * i.e. presumably "take v when v < current" (minimum search).  LD, MOV, MTC,
 * CMPLT, CMOVT and LDINT are macros defined outside this view -- confirm
 * their operand semantics in common.h.
 *
 * The unrolled loop handles eight values per iteration on four independent
 * accumulators (s1..s4), which are reduced into s1 at .L998.
 *
 * Return: $f0 = s1.  When N <= 0 or the scaled INCX <= 0, s1 still holds the
 * value written by "MTC s1, $r0" (presumably 0.0).
 */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r17
|
||||
/* NOTE(review): TEMP is not referenced anywhere in this kernel. */
#define TEMP $r18
|
||||
/* a1..a8: the eight values of one unrolled group. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* s1..s4: running candidates; s1 also carries the final result. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are dereferenced first. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Default result for the two early exits below. */
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* Seed every accumulator with the first value; N then counts the rest. */
LD s1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
add.d X, X, INCX
|
||||
MOV s2, s1
|
||||
bge $r0, N, .L999
|
||||
MOV s3, s1
|
||||
/* I = number of 8-value groups. */
srai.d I, N, 3
|
||||
MOV s4, s1
|
||||
bge $r0, I, .L15
|
||||
/* Preload a1..a6; a7 and a8 are loaded inside the loop / drain code. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/*
 * Main loop: merge a1..a4 and then a5..a8 into s1..s4 while loading the
 * remaining two values of this group and the first six of the next.
 */
.L12:
|
||||
CMPLT $fcc0, a1, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, a2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, a3, s3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, a4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, a1, $fcc0
|
||||
LD a1, X, 0 * SIZE
|
||||
CMOVT s2, s2, a2, $fcc1
|
||||
add.d X, X, INCX
|
||||
CMOVT s3, s3, a3, $fcc2
|
||||
LD a2, X, 0 * SIZE
|
||||
CMOVT s4, s4, a4, $fcc3
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, a5, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, a6, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, a7, s3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, a8, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, a5, $fcc0
|
||||
LD a5, X, 0 * SIZE
|
||||
CMOVT s2, s2, a6, $fcc1
|
||||
add.d X, X, INCX
|
||||
CMOVT s3, s3, a7, $fcc2
|
||||
LD a6, X, 0 * SIZE
|
||||
CMOVT s4, s4, a8, $fcc3
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: finish the last group (only a7 and a8 still need loading). */
.L13:
|
||||
CMPLT $fcc0, a1, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, a2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, a3, s3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, a4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, a1, $fcc0
|
||||
CMOVT s2, s2, a2, $fcc1
|
||||
CMOVT s3, s3, a3, $fcc2
|
||||
CMOVT s4, s4, a4, $fcc3
|
||||
CMPLT $fcc0, a5, s1
|
||||
CMPLT $fcc1, a6, s2
|
||||
CMPLT $fcc2, a7, s3
|
||||
CMPLT $fcc3, a8, s4
|
||||
CMOVT s1, s1, a5, $fcc0
|
||||
CMOVT s2, s2, a6, $fcc1
|
||||
CMOVT s3, s3, a7, $fcc2
|
||||
CMOVT s4, s4, a8, $fcc3
|
||||
.align 3
|
||||
|
||||
/* Tail: the remaining N & 7 values, merged into s1 one at a time. */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, a1, s1
|
||||
CMOVT s1, s1, a1, $fcc0
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Reduce the four accumulators into s1. */
.L998:
|
||||
CMPLT $fcc0, s2, s1
|
||||
CMPLT $fcc1, s4, s3
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
CMPLT $fcc0, s3, s1
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
.align 3
|
||||
|
||||
/*
 * Return s1 ($f22) in $f0.
 * NOTE(review): $r17 is the loop counter I in this kernel, so the move into
 * $r4 does not carry a result -- it looks like a leftover from the
 * position-returning kernels; confirm it can be ignored.
 */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,330 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * In-place vector scaling: every one of the N values addressed through X is
 * replaced by ALPHA * value.  X advances by INCX << BASE_SHIFT bytes.
 *
 * Four code paths, selected by two tests:
 *   ALPHA == 0, unit stride   (.L12/.L16)  store zero, no loads
 *   ALPHA == 0, other stride  (.L22/.L26)  store zero, no loads
 *   ALPHA != 0, unit stride   (.L52/.L56)  load, multiply, store
 *   ALPHA != 0, other stride  (.L62/.L66)  load via X, store via XX
 * "Unit stride" means the scaled INCX equals SIZE.  The zero test relies on
 * "MTC a1, $r0" producing 0.0 and CMPEQ setting the flag on equality; both
 * are macros defined outside this file (as are LD, ST and MUL) -- confirm in
 * common.h.  Because the ALPHA == 0 paths never read X, the previous contents
 * of X do not influence the stored zeros.
 *
 * Each path processes N >> 3 unrolled groups of eight values followed by
 * N & 7 single values.  N <= 0 returns immediately.
 *
 * Every exit copies $r17 (the loop counter I) to $r4 and $f22 (a1) to $f0.
 * NOTE(review): on the N <= 0 early exit I has not been written yet, so $r4
 * is not meaningful there -- confirm callers ignore the return value.
 */
#define N $r4
#define X $r7
#define INCX $r8

#define I $r17
#define TEMP $r18
#define XX $r5
#define ALPHA $f0
/* a1..a8: loaded values (a1 doubles as the 0.0 constant on the zero paths). */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/* t1..t4: products waiting to be stored. */
#define t1 $f14
#define t2 $f15
#define t3 $f16
#define t4 $f17

    PROLOGUE

    /* TEMP = byte size of one value, used to detect unit stride. */
    li.d TEMP, SIZE
    MTC a1, $r0
    slli.d INCX, INCX, BASE_SHIFT
    bge $r0, N, .L999
    /* ALPHA != 0 -> general multiply paths at .L50. */
    CMPEQ $fcc0, ALPHA, a1
    bceqz $fcc0, .L50
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0, I, .L15
    .align 3

/* ALPHA == 0, unit stride: store eight zeros per iteration. */
.L12:
    ST a1, X, 0 * SIZE
    ST a1, X, 1 * SIZE
    ST a1, X, 2 * SIZE
    ST a1, X, 3 * SIZE
    ST a1, X, 4 * SIZE
    ST a1, X, 5 * SIZE
    ST a1, X, 6 * SIZE
    ST a1, X, 7 * SIZE
    /*
     * 64-bit decrement: I comes from "srai.d I, N, 3", and every other loop
     * in this kernel counts with addi.d.  A 32-bit addi.w would truncate and
     * sign-extend the counter for very large N.
     */
    addi.d I, I, -1
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L12
    .align 3

/* ALPHA == 0, unit stride: remaining N & 7 values. */
.L15:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L16:
    ST a1, X, 0 * SIZE
    addi.d I, I, -1
    addi.d X, X, SIZE
    blt $r0, I, .L16
    move $r4, $r17
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

/* ALPHA == 0, non-unit stride. */
.L20:
    srai.d I, N, 3
    bge $r0, I, .L25
    .align 3

.L22:
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    ST a1, X, 0 * SIZE
    addi.d I, I, -1
    add.d X, X, INCX
    blt $r0, I, .L22
    .align 3

/* ALPHA == 0, non-unit stride: remaining N & 7 values. */
.L25:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L26:
    addi.d I, I, -1
    ST a1, X, 0 * SIZE
    add.d X, X, INCX
    blt $r0, I, .L26
    move $r4, $r17
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

/*
 * ALPHA != 0.  I is biased by -1 so that a negative value means "no full
 * group" and zero means "exactly one group, go straight to the drain code".
 */
.L50:
    srai.d I, N, 3
    bne INCX, TEMP, .L60
    addi.d I, I, -1
    blt I, $r0, .L55
    /* Preload the first group of eight. */
    LD a1, X, 0 * SIZE
    LD a2, X, 1 * SIZE
    LD a3, X, 2 * SIZE
    LD a4, X, 3 * SIZE
    LD a5, X, 4 * SIZE
    LD a6, X, 5 * SIZE
    LD a7, X, 6 * SIZE
    LD a8, X, 7 * SIZE
    bge $r0, I, .L53
    .align 3

/* Unit stride main loop: scale the current group while loading the next. */
.L52:
    MUL t1, ALPHA, a1
    LD a1, X, 8 * SIZE
    MUL t2, ALPHA, a2
    LD a2, X, 9 * SIZE
    MUL t3, ALPHA, a3
    LD a3, X, 10 * SIZE
    MUL t4, ALPHA, a4
    LD a4, X, 11 * SIZE
    ST t1, X, 0 * SIZE
    MUL t1, ALPHA, a5
    LD a5, X, 12 * SIZE
    ST t2, X, 1 * SIZE
    MUL t2, ALPHA, a6
    LD a6, X, 13 * SIZE
    ST t3, X, 2 * SIZE
    MUL t3, ALPHA, a7
    LD a7, X, 14 * SIZE
    ST t4, X, 3 * SIZE
    MUL t4, ALPHA, a8
    LD a8, X, 15 * SIZE
    addi.d I, I, -1
    ST t1, X, 4 * SIZE
    ST t2, X, 5 * SIZE
    ST t3, X, 6 * SIZE
    ST t4, X, 7 * SIZE
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L52
    .align 3

/* Unit stride drain: scale and store the last preloaded group. */
.L53:
    MUL t1, ALPHA, a1
    MUL t2, ALPHA, a2
    MUL t3, ALPHA, a3
    MUL t4, ALPHA, a4
    ST t1, X, 0 * SIZE
    MUL t1, ALPHA, a5
    ST t2, X, 1 * SIZE
    MUL t2, ALPHA, a6
    ST t3, X, 2 * SIZE
    MUL t3, ALPHA, a7
    ST t4, X, 3 * SIZE
    MUL t4, ALPHA, a8
    ST t1, X, 4 * SIZE
    ST t2, X, 5 * SIZE
    ST t3, X, 6 * SIZE
    ST t4, X, 7 * SIZE
    addi.d X, X, 8 * SIZE
    .align 3

/* Unit stride: remaining N & 7 values. */
.L55:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L56:
    LD a1, X, 0 * SIZE
    MUL t1, ALPHA, a1
    addi.d X, X, SIZE
    addi.d I, I, -1
    /* X has already been advanced, hence the -1 * SIZE offset. */
    ST t1, X, -1 * SIZE
    blt $r0, I, .L56
    move $r4, $r17
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0
    .align 3

/*
 * ALPHA != 0, non-unit stride.  X runs ahead as the load pointer while XX
 * trails behind as the store pointer over the same addresses.
 */
.L60:
    srai.d I, N, 3
    move XX, X
    addi.d I, I, -1
    blt I, $r0, .L65
    LD a1, X, 0 * SIZE
    add.d X, X, INCX
    LD a2, X, 0 * SIZE
    add.d X, X, INCX
    LD a3, X, 0 * SIZE
    add.d X, X, INCX
    LD a4, X, 0 * SIZE
    add.d X, X, INCX
    LD a5, X, 0 * SIZE
    add.d X, X, INCX
    LD a6, X, 0 * SIZE
    add.d X, X, INCX
    LD a7, X, 0 * SIZE
    add.d X, X, INCX
    LD a8, X, 0 * SIZE
    add.d X, X, INCX
    bge $r0, I, .L63
    .align 3

/* Strided main loop: scale the current group while loading the next. */
.L62:
    MUL t1, ALPHA, a1
    LD a1, X, 0 * SIZE
    add.d X, X, INCX
    MUL t2, ALPHA, a2
    LD a2, X, 0 * SIZE
    add.d X, X, INCX
    MUL t3, ALPHA, a3
    LD a3, X, 0 * SIZE
    add.d X, X, INCX
    MUL t4, ALPHA, a4
    LD a4, X, 0 * SIZE
    add.d X, X, INCX
    ST t1, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t2, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t3, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t4, XX, 0 * SIZE
    add.d XX, XX, INCX
    MUL t1, ALPHA, a5
    LD a5, X, 0 * SIZE
    add.d X, X, INCX
    MUL t2, ALPHA, a6
    LD a6, X, 0 * SIZE
    add.d X, X, INCX
    MUL t3, ALPHA, a7
    LD a7, X, 0 * SIZE
    add.d X, X, INCX
    MUL t4, ALPHA, a8
    LD a8, X, 0 * SIZE
    add.d X, X, INCX
    ST t1, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t2, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t3, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t4, XX, 0 * SIZE
    addi.d I, I, -1
    add.d XX, XX, INCX
    blt $r0, I, .L62
    .align 3

/* Strided drain: scale and store the last preloaded group through XX. */
.L63:
    MUL t1, ALPHA, a1
    MUL t2, ALPHA, a2
    MUL t3, ALPHA, a3
    MUL t4, ALPHA, a4
    ST t1, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t2, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t3, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t4, XX, 0 * SIZE
    add.d XX, XX, INCX
    MUL t1, ALPHA, a5
    MUL t2, ALPHA, a6
    MUL t3, ALPHA, a7
    MUL t4, ALPHA, a8
    ST t1, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t2, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t3, XX, 0 * SIZE
    add.d XX, XX, INCX
    ST t4, XX, 0 * SIZE
    add.d XX, XX, INCX
    .align 3

/* Strided: remaining N & 7 values, loaded and stored through X. */
.L65:
    andi I, N, 7
    bge $r0, I, .L999
    .align 3
.L66:
    LD a1, X, 0 * SIZE
    MUL t1, ALPHA, a1
    addi.d I, I, -1
    ST t1, X, 0 * SIZE
    add.d X, X, INCX
    blt $r0, I, .L66
    .align 3

.L999:
    move $r4, $r17
    fmov.d $f0, $f22
    jirl $r0, $r1, 0x0

    EPILOGUE
|
||||
|
|
@ -0,0 +1,249 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Square root of a sum of squares over N strided values.
 *
 * Every loaded value is widened with fcvt.d.s and accumulated with fmadd.d
 * (acc = v * v + acc) into one of two double-precision accumulators, s1 and
 * s2.  At .L999 the accumulators are added, fsqrt.d is applied and the
 * result is narrowed with fcvt.s.d into $f0.
 *
 * X advances by INCX << BASE_SHIFT bytes.  Two code paths:
 *   scaled INCX == SIZE  (.L12/.L13/.L16)  immediate-offset loads
 *   any other stride     (.L23/.L24/.L26)  pointer bumped after each load
 * Each path runs N >> 3 unrolled groups of eight and then N & 7 single
 * values.  When N <= 0 or INCX <= 0 both accumulators are still zero, so
 * the returned value is sqrt(0).
 *
 * LD, NOP and LDINT are macros defined outside this view.
 */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
/* a1..a8: the eight loaded values of one unrolled group. */
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define a5 $f16
|
||||
#define a6 $f17
|
||||
#define a7 $f0
|
||||
#define a8 $f1
|
||||
/* s1, s2: double-precision accumulators; t1..t4: widened values. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define t1 $f23
|
||||
#define t2 $f9
|
||||
#define t3 $f10
|
||||
#define t4 $f11
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are dereferenced first. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Zero both accumulators; TEMP = SIZE is the unit-stride reference. */
movgr2fr.d s1, $r0
|
||||
li.d TEMP, SIZE
|
||||
fmov.d s2, s1
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* I = number of 8-value groups; pick the path by stride. */
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L15
|
||||
/* Unit stride: preload eight values and widen the first four. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, X, 2 * SIZE
|
||||
LD a4, X, 3 * SIZE
|
||||
LD a5, X, 4 * SIZE
|
||||
addi.d I, I, -1
|
||||
fcvt.d.s t1, a1
|
||||
LD a6, X, 5 * SIZE
|
||||
fcvt.d.s t2, a2
|
||||
LD a7, X, 6 * SIZE
|
||||
fcvt.d.s t3, a3
|
||||
LD a8, X, 7 * SIZE
|
||||
fcvt.d.s t4, a4
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/*
 * Unit stride main loop: accumulate the widened values of the current group
 * while loading and widening the next.  The NOPs are presumably scheduling
 * padding -- the macro is defined outside this view.
 */
.L12:
|
||||
fmadd.d s1, t1, t1, s1
|
||||
LD a1, X, 8 * SIZE
|
||||
fcvt.d.s t1, a5
|
||||
NOP
|
||||
fmadd.d s2, t2, t2, s2
|
||||
LD a2, X, 9 * SIZE
|
||||
fcvt.d.s t2, a6
|
||||
NOP
|
||||
fmadd.d s1, t3, t3, s1
|
||||
LD a3, X, 10 * SIZE
|
||||
fcvt.d.s t3, a7
|
||||
NOP
|
||||
fmadd.d s2, t4, t4, s2
|
||||
LD a4, X, 11 * SIZE
|
||||
fcvt.d.s t4, a8
|
||||
NOP
|
||||
fmadd.d s1, t1, t1, s1
|
||||
LD a5, X, 12 * SIZE
|
||||
fcvt.d.s t1, a1
|
||||
NOP
|
||||
fmadd.d s2, t2, t2, s2
|
||||
LD a6, X, 13 * SIZE
|
||||
fcvt.d.s t2, a2
|
||||
addi.d I, I, -1
|
||||
fmadd.d s1, t3, t3, s1
|
||||
LD a7, X, 14 * SIZE
|
||||
fcvt.d.s t3, a3
|
||||
/* X moves to the next group here, so a8 is read at offset 7, not 15. */
addi.d X, X, 8 * SIZE
|
||||
fmadd.d s2, t4, t4, s2
|
||||
LD a8, X, 7 * SIZE
|
||||
fcvt.d.s t4, a4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Unit stride drain: accumulate the last preloaded group. */
.L13:
|
||||
fmadd.d s1, t1, t1, s1
|
||||
fcvt.d.s t1, a5
|
||||
fmadd.d s2, t2, t2, s2
|
||||
fcvt.d.s t2, a6
|
||||
fmadd.d s1, t3, t3, s1
|
||||
fcvt.d.s t3, a7
|
||||
fmadd.d s2, t4, t4, s2
|
||||
fcvt.d.s t4, a8
|
||||
fmadd.d s1, t1, t1, s1
|
||||
fmadd.d s2, t2, t2, s2
|
||||
fmadd.d s1, t3, t3, s1
|
||||
fmadd.d s2, t4, t4, s2
|
||||
addi.d X, X, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Unit stride: remaining N & 7 values, accumulated into s1. */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fcvt.d.s t1, a1
|
||||
fmadd.d s1, t1, t1, s1
|
||||
addi.d X, X, SIZE
|
||||
blt $r0, I, .L16
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
/* Non-unit stride: preload eight values and widen the first four. */
.L20:
|
||||
bge $r0, I, .L25
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fcvt.d.s t1, a1
|
||||
fcvt.d.s t2, a2
|
||||
fcvt.d.s t3, a3
|
||||
fcvt.d.s t4, a4
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
/* Strided main loop: same schedule as .L12 with pointer-bump addressing. */
.L23:
|
||||
fmadd.d s1, t1, t1, s1
|
||||
LD a1, X, 0 * SIZE
|
||||
fcvt.d.s t1, a5
|
||||
add.d X, X, INCX
|
||||
fmadd.d s2, t2, t2, s2
|
||||
LD a2, X, 0 * SIZE
|
||||
fcvt.d.s t2, a6
|
||||
add.d X, X, INCX
|
||||
fmadd.d s1, t3, t3, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
fcvt.d.s t3, a7
|
||||
add.d X, X, INCX
|
||||
fmadd.d s2, t4, t4, s2
|
||||
LD a4, X, 0 * SIZE
|
||||
fcvt.d.s t4, a8
|
||||
add.d X, X, INCX
|
||||
fmadd.d s1, t1, t1, s1
|
||||
LD a5, X, 0 * SIZE
|
||||
fcvt.d.s t1, a1
|
||||
add.d X, X, INCX
|
||||
fmadd.d s2, t2, t2, s2
|
||||
LD a6, X, 0 * SIZE
|
||||
fcvt.d.s t2, a2
|
||||
add.d X, X, INCX
|
||||
fmadd.d s1, t3, t3, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
fcvt.d.s t3, a3
|
||||
add.d X, X, INCX
|
||||
fmadd.d s2, t4, t4, s2
|
||||
LD a8, X, 0 * SIZE
|
||||
fcvt.d.s t4, a4
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
/* Strided drain: accumulate the last preloaded group. */
.L24:
|
||||
fmadd.d s1, t1, t1, s1
|
||||
fcvt.d.s t1, a5
|
||||
fmadd.d s2, t2, t2, s2
|
||||
fcvt.d.s t2, a6
|
||||
fmadd.d s1, t3, t3, s1
|
||||
fcvt.d.s t3, a7
|
||||
fmadd.d s2, t4, t4, s2
|
||||
fcvt.d.s t4, a8
|
||||
fmadd.d s1, t1, t1, s1
|
||||
fmadd.d s2, t2, t2, s2
|
||||
fmadd.d s1, t3, t3, s1
|
||||
fmadd.d s2, t4, t4, s2
|
||||
.align 3
|
||||
|
||||
/* Strided: remaining N & 7 values, accumulated into s1. */
.L25:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
fcvt.d.s t1, a1
|
||||
add.d X, X, INCX
|
||||
fmadd.d s1, t1, t1, s1
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
/*
 * Combine the accumulators, take the square root in double precision and
 * return the single-precision result in $f0.
 * NOTE(review): $r17 is the loop counter I in this kernel, so the move into
 * $r4 does not carry a result -- confirm it can be ignored.
 */
.L999:
|
||||
fadd.d s1, s1, s2
|
||||
fsqrt.d s1, s1
|
||||
move $r4, $r17
|
||||
fcvt.s.d $f0, s1
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,330 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the exchange kernel below. */
#define N $r4 /* element count: tested against zero, shifted right by 3 for the unrolled trip count */
|
||||
#define X $r7 /* running address into the first vector */
|
||||
#define INCX $r8 /* stride of X; shifted left by BASE_SHIFT on entry */
|
||||
#define Y $r9 /* running address into the second vector */
|
||||
#define INCY $r10 /* stride of Y; shifted left by BASE_SHIFT on entry */
|
||||
|
||||
#define I $r17 /* loop counter */
|
||||
#define TEMP $r18 /* holds SIZE for the contiguous-stride comparison */
|
||||
#define XX $r5 /* store address for X in the strided path (trails X) */
|
||||
#define YY $r6 /* store address for Y in the strided path (trails Y) */
|
||||
#define a1 $f22 /* a1..a8: values loaded from X */
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
#define b1 $f14 /* b1..b8: values loaded from Y */
|
||||
#define b2 $f15
|
||||
#define b3 $f16
|
||||
#define b4 $f17
|
||||
#define b5 $f0
|
||||
#define b6 $f1
|
||||
#define b7 $f2
|
||||
#define b8 $f3
|
||||
|
||||
/* Entry: exchanges N values between the vectors addressed by X and Y.
 * SIZE, BASE_SHIFT, LD and ST come from common.h (not shown here); presumably
 * SIZE is the byte size of one value and BASE_SHIFT its log2 -- TODO confirm. */
PROLOGUE
|
||||
|
||||
li.d TEMP, SIZE /* TEMP = SIZE, the scaled stride that selects the contiguous path */
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, N, .L999 /* N <= 0: nothing to exchange */
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20 /* X not contiguous: strided path */
|
||||
srai.d I, N, 3 /* I = N / 8, number of full 8-value groups */
|
||||
bne INCY, TEMP, .L20 /* Y not contiguous: strided path */
|
||||
addi.d I, I, -1
|
||||
blt I, $r0, .L15 /* fewer than 8 values: only the remainder loop runs */
|
||||
LD a1, X, 0 * SIZE /* preload the first group of 8 from X (a*) and Y (b*) */
|
||||
LD b1, Y, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD b2, Y, 1 * SIZE
|
||||
LD a3, X, 2 * SIZE
|
||||
LD b3, Y, 2 * SIZE
|
||||
LD a4, X, 3 * SIZE
|
||||
LD b4, Y, 3 * SIZE
|
||||
LD a5, X, 4 * SIZE
|
||||
LD b5, Y, 4 * SIZE
|
||||
LD a6, X, 5 * SIZE
|
||||
LD b6, Y, 5 * SIZE
|
||||
LD a7, X, 6 * SIZE
|
||||
LD b7, Y, 6 * SIZE
|
||||
LD a8, X, 7 * SIZE
|
||||
LD b8, Y, 7 * SIZE
|
||||
bge $r0, I, .L13 /* exactly one full group: skip the steady-state loop */
|
||||
.align 3
|
||||
|
||||
/* Contiguous steady-state loop. Each step stores the previously loaded value
 * into the opposite vector (a* -> Y, b* -> X) and immediately reloads the same
 * register from the next group (offsets 8..15), so the instruction order matters. */
.L12:
|
||||
ST a1, Y, 0 * SIZE
|
||||
LD a1, X, 8 * SIZE
|
||||
ST b1, X, 0 * SIZE
|
||||
LD b1, Y, 8 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
LD a2, X, 9 * SIZE
|
||||
ST b2, X, 1 * SIZE
|
||||
LD b2, Y, 9 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
LD a3, X, 10 * SIZE
|
||||
ST b3, X, 2 * SIZE
|
||||
LD b3, Y, 10 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
LD a4, X, 11 * SIZE
|
||||
ST b4, X, 3 * SIZE
|
||||
LD b4, Y, 11 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
LD a5, X, 12 * SIZE
|
||||
ST b5, X, 4 * SIZE
|
||||
LD b5, Y, 12 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
LD a6, X, 13 * SIZE
|
||||
ST b6, X, 5 * SIZE
|
||||
LD b6, Y, 13 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
LD a7, X, 14 * SIZE
|
||||
ST b7, X, 6 * SIZE
|
||||
LD b7, Y, 14 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
LD a8, X, 15 * SIZE
|
||||
ST b8, X, 7 * SIZE
|
||||
LD b8, Y, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 8 * SIZE /* advance both vectors by one group */
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L12 /* repeat while I > 0 */
|
||||
.align 3
|
||||
|
||||
/* Contiguous drain: stores the last preloaded group (a* -> Y, b* -> X) without
 * loading another one, then steps X and Y past it for the remainder loop. */
.L13:
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST b1, X, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
ST b2, X, 1 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
ST b3, X, 2 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
ST b4, X, 3 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
ST b5, X, 4 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
ST b6, X, 5 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
ST b7, X, 6 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
ST b8, X, 7 * SIZE
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Contiguous remainder: exchanges the N & 7 values left after the 8-wide groups. */
.L15:
|
||||
andi I, N, 7 /* I = N mod 8 */
|
||||
bge $r0, I, .L999 /* no leftover values */
|
||||
.align 3
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD b1, Y, 0 * SIZE
|
||||
addi.d X, X, SIZE /* pointers advance before the stores ... */
|
||||
addi.d I, I, -1
|
||||
addi.d Y, Y, SIZE
|
||||
ST b1, X, -1 * SIZE /* ... so the stores address the slot just read, at offset -1 */
|
||||
ST a1, Y, -1 * SIZE
|
||||
blt $r0, I, .L16
|
||||
b .L999 /* skip over the strided path that follows */
|
||||
.align 3
|
||||
|
||||
/* Strided path setup. XX/YY keep the original addresses for the later stores,
 * while X/Y run one group ahead performing the loads. */
.L20:
|
||||
srai.d I, N, 3 /* I = N / 8 */
|
||||
move XX, X
|
||||
move YY, Y
|
||||
addi.d I, I, -1
|
||||
blt I, $r0, .L25 /* fewer than 8 values: only the remainder loop runs */
|
||||
LD a1, X, 0 * SIZE /* preload 8 values from each vector, stepping by INCX / INCY */
|
||||
add.d X, X, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b5, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b6, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b7, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a8, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD b8, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
bge $r0, I, .L23 /* exactly one full group: skip the steady-state loop */
|
||||
.align 3
|
||||
|
||||
/* Strided steady-state loop. For each of the 8 slots: store the held X value
 * through YY, reload that register from X, store the held Y value through XX,
 * reload it from Y. X/Y (loads) stay one group ahead of XX/YY (stores). */
.L22:
|
||||
ST a1, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a2, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b2, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b3, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a4, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b4, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a5, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b5, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b5, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a6, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b6, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b6, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a7, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b7, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b7, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a8, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
LD a8, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST b8, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD b8, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L22 /* repeat while I > 0 */
|
||||
.align 3
|
||||
|
||||
/* Strided drain: stores the last preloaded group through the trailing
 * addresses (a* via YY, b* via XX) without issuing further loads. */
.L23:
|
||||
ST a1, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST a2, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b2, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST a3, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b3, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST a4, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b4, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST a5, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b5, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST a6, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b6, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST a7, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b7, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
ST a8, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST b8, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
.align 3
|
||||
|
||||
/* Strided remainder and common exit. X and Y are used directly here: they
 * already point past every full group that was processed. */
.L25:
|
||||
andi I, N, 7 /* I = N mod 8 */
|
||||
bge $r0, I, .L999 /* no leftover values */
|
||||
.align 3
|
||||
.L26:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD b1, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST b1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
.L999: /* exit: jirl jumps to the address held in $r1 */
|
||||
move $r4, $r17 /* copies the loop counter I into $r4 */
|
||||
fmov.d $f0, $f22 /* copies $f22 (a1) into $f0; NOTE(review): callers' use of $r4/$f0 is not visible here */
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,190 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the kernel below. */
#define N $r4 /* element count; decremented once after the first element is consumed */
|
||||
#define X $r5 /* running address into the input vector */
|
||||
#define INCX $r6 /* stride; shifted left by ZBASE_SHIFT on entry */
|
||||
#define I $r17 /* loop counter */
|
||||
#define TEMP $r18 /* defined but not referenced by the code below */
|
||||
#define a1 $f10 /* a1..a8: raw values, two per element (offsets 0 and 1 * SIZE) */
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
#define t1 $f0 /* t1..t8: FABS results and their pairwise sums */
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define t5 $f4
|
||||
#define t6 $f5
|
||||
#define t7 $f6
|
||||
#define t8 $f7
|
||||
#define s1 $f22 /* s1..s4: four independent running results; s1 is copied to $f0 on exit */
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
|
||||
/* Entry: scans N two-value elements of X and tracks a running extreme of
 * FABS(x[0]) + FABS(x[1]) per element. LD, FABS, ADD, MTC, CMPLT, CMOVT, SIZE
 * and ZBASE_SHIFT are macros from common.h (not shown here). */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) /* NOTE(review): offset(base) operand form differs from the "reg, base, offset" form of every LD below -- confirm it assembles if F_INTERFACE is ever defined */
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
MTC s1, $r0 /* presumably sets s1 to zero so the early exits return 0 -- TODO confirm MTC semantics */
|
||||
bge $r0, N, .L999 /* N <= 0: exit */
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999 /* INCX <= 0: exit */
|
||||
LD a1, X, 0 * SIZE /* first element seeds the running results */
|
||||
addi.d N, N, -1
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD s1, t1, t2
|
||||
bge $r0, N, .L999 /* only one element: s1 is already the answer */
|
||||
ADD s2, t1, t2 /* s2..s4 start from the same seed as s1 */
|
||||
srai.d I, N, 2 /* I = remaining elements / 4 */
|
||||
ADD s3, t1, t2
|
||||
ADD s4, t1, t2
|
||||
bge $r0, I, .L15 /* fewer than 4 remaining: remainder loop only */
|
||||
LD a1, X, 0 * SIZE /* preload one group of 4 elements */
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13 /* exactly one group: skip the steady-state loop */
|
||||
.align 3
|
||||
|
||||
/* Steady-state loop: processes the 4 preloaded elements while loading the next
 * 4. Each a* register is reloaded only after its FABS has consumed it. */
.L12:
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t3, a3
|
||||
add.d X, X, INCX
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t6, a6
|
||||
LD a4, X, 1 * SIZE
|
||||
FABS t7, a7
|
||||
add.d X, X, INCX
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums of the two absolute values */
|
||||
LD a5, X, 0 * SIZE
|
||||
ADD t3, t3, t4
|
||||
LD a6, X, 1 * SIZE
|
||||
ADD t5, t5, t6
|
||||
add.d X, X, INCX
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, s1, t1 /* compare running value (first operand) with candidate */
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t3
|
||||
LD a8, X, 1 * SIZE
|
||||
CMPLT $fcc2, s3, t5
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc3, s4, t7
|
||||
CMOVT s1, s1, t1, $fcc0 /* presumably replaces s with t when the flag is set, i.e. keeps the larger -- TODO confirm in common.h */
|
||||
addi.d I, I, -1
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
blt $r0, I, .L12 /* repeat while I > 0 */
|
||||
.align 3
|
||||
|
||||
/* Drain: folds the last preloaded group of 4 elements into s1..s4 without
 * loading any further data. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
FABS t6, a6
|
||||
FABS t7, a7
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
ADD t5, t5, t6
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t3
|
||||
CMPLT $fcc2, s3, t5
|
||||
CMPLT $fcc3, s4, t7
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
.align 3
|
||||
|
||||
/* Remainder loop, final reduction of s1..s4 into s1, and exit. */
.L15:
|
||||
andi I, N, 3 /* I = remaining elements mod 4 */
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16: /* one element per iteration, folded into s1 only */
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD t1, t1, t2
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
.L998: /* pairwise reduction: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1 */
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
.align 3
|
||||
|
||||
.L999: /* exit: jirl jumps to the address held in $r1 */
|
||||
move $r4, $r17 /* copies the loop counter I into $r4 */
|
||||
fmov.d $f0, $f22 /* copies s1 ($f22) into $f0 */
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,198 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the kernel below. */
#define N $r4 /* element count; decremented once after the first element is consumed */
|
||||
#define X $r5 /* running address into the input vector */
|
||||
#define INCX $r6 /* stride; shifted left by ZBASE_SHIFT on entry */
|
||||
#define I $r17 /* loop counter */
|
||||
#define TEMP $r18 /* defined but not referenced by the code below */
|
||||
#define a1 $f10 /* a1..a8: raw values, two per element (offsets 0 and 1 * SIZE) */
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
#define t1 $f0 /* t1..t8: FABS results and their pairwise sums */
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define t5 $f4
|
||||
#define t6 $f5
|
||||
#define t7 $f6
|
||||
#define t8 $f7
|
||||
#define s1 $f22 /* s1..s4: four independent running results; s1 is copied to $f0 on exit */
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
|
||||
/* Entry: scans N two-value elements of X and tracks a running extreme of
 * FABS(x[0]) + FABS(x[1]) per element, using CMPLT with the candidate as first
 * operand. LD, FABS, ADD, MTC, NOP, CMPLT, CMOVT, SIZE and ZBASE_SHIFT are
 * macros from common.h (not shown here). */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) /* NOTE(review): offset(base) operand form differs from the "reg, base, offset" form of every LD below -- confirm it assembles if F_INTERFACE is ever defined */
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
MTC s1, $r0 /* presumably sets s1 to zero so the early exits return 0 -- TODO confirm MTC semantics */
|
||||
bge $r0, N, .L999 /* N <= 0: exit */
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999 /* INCX <= 0: exit */
|
||||
LD a1, X, 0 * SIZE /* first element seeds the running results */
|
||||
addi.d N, N, -1
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD s1, t1, t2
|
||||
bge $r0, N, .L999 /* only one element: s1 is already the answer */
|
||||
NOP
|
||||
ADD s2, t1, t2 /* s2..s4 start from the same seed as s1 */
|
||||
srai.d I, N, 2 /* I = remaining elements / 4 */
|
||||
ADD s3, t1, t2
|
||||
ADD s4, t1, t2
|
||||
bge $r0, I, .L15 /* fewer than 4 remaining: remainder loop only */
|
||||
LD a1, X, 0 * SIZE /* preload one group of 4 elements */
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L13 /* exactly one group: skip the steady-state loop */
|
||||
.align 3
|
||||
|
||||
/* Steady-state loop: processes the 4 preloaded elements while loading the next
 * 4. Each a* register is reloaded only after its FABS has consumed it.
 * NOTE(review): the interleaved NOPs look like scheduling/padding filler --
 * confirm they are intentional. */
.L12:
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t3, a3
|
||||
add.d X, X, INCX
|
||||
FABS t4, a4
|
||||
NOP
|
||||
FABS t5, a5
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t6, a6
|
||||
LD a4, X, 1 * SIZE
|
||||
FABS t7, a7
|
||||
add.d X, X, INCX
|
||||
FABS t8, a8
|
||||
NOP
|
||||
ADD t1, t1, t2 /* t1, t3, t5, t7 = per-element sums of the two absolute values */
|
||||
LD a5, X, 0 * SIZE
|
||||
ADD t3, t3, t4
|
||||
LD a6, X, 1 * SIZE
|
||||
ADD t5, t5, t6
|
||||
add.d X, X, INCX
|
||||
ADD t7, t7, t8
|
||||
NOP
|
||||
CMPLT $fcc0, t1, s1 /* compare candidate (first operand) with running value */
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, t3, s2
|
||||
LD a8, X, 1 * SIZE
|
||||
CMPLT $fcc2, t5, s3
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc3, t7, s4
|
||||
NOP
|
||||
CMOVT s1, s1, t1, $fcc0 /* presumably replaces s with t when the flag is set, i.e. keeps the smaller -- TODO confirm in common.h */
|
||||
addi.d I, I, -1
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
NOP
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
blt $r0, I, .L12 /* repeat while I > 0 */
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
/* Drain: folds the last preloaded group of 4 elements into s1..s4 without
 * loading any further data. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
FABS t6, a6
|
||||
FABS t7, a7
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
ADD t5, t5, t6
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t3, s2
|
||||
CMPLT $fcc2, t5, s3
|
||||
CMPLT $fcc3, t7, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
.align 3
|
||||
|
||||
/* Remainder loop, final reduction of s1..s4 into s1, and exit. */
.L15:
|
||||
andi I, N, 3 /* I = remaining elements mod 4 */
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L16: /* one element per iteration, folded into s1 only */
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD t1, t1, t2
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
.L998: /* pairwise reduction: (s1,s2) -> s1, (s3,s4) -> s3, then (s1,s3) -> s1 */
|
||||
CMPLT $fcc0, s2, s1
|
||||
CMPLT $fcc1, s4, s3
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
CMPLT $fcc0, s3, s1
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
.align 3
|
||||
|
||||
.L999: /* exit: jirl jumps to the address held in $r1 */
|
||||
move $r4, $r17 /* copies the loop counter I into $r4 */
|
||||
fmov.d $f0, $f22 /* copies s1 ($f22) into $f0 */
|
||||
jirl $r0, $r1, 0x0
|
||||
NOP
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,158 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the kernel below. */
#define N $r4 /* element count */
|
||||
#define X $r5 /* running address into the input vector */
|
||||
#define INCX $r6 /* stride; shifted left by ZBASE_SHIFT on entry */
|
||||
#define I $r17 /* loop counter */
|
||||
#define TEMP $r18 /* defined but not referenced by the code below */
|
||||
#define a1 $f23 /* a1..a8: raw values, two per element (offsets 0 and 1 * SIZE) */
|
||||
#define a2 $f9
|
||||
#define a3 $f10
|
||||
#define a4 $f11
|
||||
#define a5 $f12
|
||||
#define a6 $f13
|
||||
#define a7 $f14
|
||||
#define a8 $f15
|
||||
#define t1 $f16 /* t1..t4: FABS results waiting to be accumulated */
|
||||
#define t2 $f17
|
||||
#define t3 $f0
|
||||
#define t4 $f1
|
||||
#define s1 $f22 /* accumulator for the offset-0 values; final result */
|
||||
#define s2 $f8 /* accumulator for the offset-1 values */
|
||||
|
||||
/* Entry: accumulates FABS of both values of each of the N elements of X into
 * s1 (offset 0) and s2 (offset 1). LD, FABS, ADD, MTC, NOP, SIZE and
 * ZBASE_SHIFT are macros from common.h (not shown here). */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) /* NOTE(review): offset(base) operand form differs from the "reg, base, offset" form of every LD below -- confirm it assembles if F_INTERFACE is ever defined */
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
MTC s1, $r0 /* presumably zeroes both accumulators -- TODO confirm MTC semantics */
|
||||
MTC s2, $r0
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
srai.d I, N, 2 /* I = N / 4 */
|
||||
bge $r0, N, .L999 /* N <= 0: exit with the initial accumulators */
|
||||
bge $r0, I, .L25 /* fewer than 4 elements: remainder loop only */
|
||||
LD a1, X, 0 * SIZE /* preload one group of 4 elements */
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1 /* absolute values of the first two elements are formed up front */
|
||||
FABS t2, a2
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L24 /* exactly one group: skip the steady-state loop */
|
||||
.align 3
|
||||
|
||||
/* Steady-state loop: each ADD consumes a pending absolute value (t*), the LD
 * refills an a* register for the next group, and the FABS prepares the next
 * pending value. The order is load-bearing because t* and a* are reused. */
.L23:
|
||||
ADD s1, s1, t1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t1, a5
|
||||
addi.d I, I, -1
|
||||
ADD s2, s2, t2
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t3
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t3, a7
|
||||
NOP
|
||||
ADD s2, s2, t4
|
||||
LD a4, X, 1 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t1
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t1, a1
|
||||
NOP
|
||||
ADD s2, s2, t2
|
||||
LD a6, X, 1 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t3
|
||||
LD a7, X, 0 * SIZE
|
||||
FABS t3, a3
|
||||
LD a8, X, 1 * SIZE
|
||||
ADD s2, s2, t4
|
||||
add.d X, X, INCX
|
||||
FABS t4, a4
|
||||
blt $r0, I, .L23 /* repeat while I > 0 */
|
||||
.align 3
|
||||
|
||||
/* Drain: accumulates the four pending absolute values, then forms and
 * accumulates those of a5..a8, without loading further data. */
.L24:
|
||||
ADD s1, s1, t1
|
||||
FABS t1, a5
|
||||
ADD s2, s2, t2
|
||||
FABS t2, a6
|
||||
ADD s1, s1, t3
|
||||
FABS t3, a7
|
||||
ADD s2, s2, t4
|
||||
FABS t4, a8
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
|
||||
/* Remainder loop (N & 3 elements), final combination of the two accumulators,
 * and exit. */
.L25:
|
||||
andi I, N, 3 /* I = N mod 4 */
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L26: /* one element per iteration */
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t1, a1
|
||||
addi.d I, I, -1
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
.L999: /* exit: jirl jumps to the address held in $r1 */
|
||||
ADD s1, s1, s2 /* total = offset-0 sum + offset-1 sum */
|
||||
move $r4, $r17 /* copies the loop counter I into $r4 */
|
||||
fmov.d $f0, $f22 /* copies s1 ($f22) into $f0 */
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,217 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the copy kernel below. */
#define N $r4 /* element count */
|
||||
#define X $r5 /* running source address */
|
||||
#define INCX $r6 /* source stride; shifted left by ZBASE_SHIFT on entry */
|
||||
#define Y $r7 /* running destination address */
|
||||
#define INCY $r8 /* destination stride; shifted left by ZBASE_SHIFT on entry */
|
||||
#define I $r17 /* loop counter */
|
||||
#define TEMP $r18 /* holds 2 * SIZE for the contiguous-stride comparison */
|
||||
#define a1 $f22 /* a1..a8: staging registers, 4 elements of two values each */
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
|
||||
/* Entry: copies N two-value elements from X to Y. LD, ST, NOP, SIZE and
 * ZBASE_SHIFT are macros from common.h (not shown here). */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N) /* NOTE(review): offset(base) operand form differs from the "reg, base, offset" form of every LD below -- confirm it assembles if F_INTERFACE is ever defined */
|
||||
LDINT INCX, 0(INCX)
|
||||
LDINT INCY, 0(INCY)
|
||||
#endif
|
||||
|
||||
li.d TEMP, 2 * SIZE /* scaled stride that selects the contiguous path */
|
||||
NOP
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, N, .L999 /* N <= 0: nothing to copy */
|
||||
slli.d INCY, INCY, ZBASE_SHIFT
|
||||
bne INCX, TEMP, .L20 /* source not contiguous: strided path */
|
||||
srai.d I, N, 2 /* I = N / 4 */
|
||||
bne INCY, TEMP, .L20 /* destination not contiguous: strided path */
|
||||
addi.d I, I, -1
|
||||
blt I, $r0, .L15 /* fewer than 4 elements: remainder loop only */
|
||||
LD a1, X, 0 * SIZE /* preload 4 elements (8 values) */
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, X, 2 * SIZE
|
||||
LD a4, X, 3 * SIZE
|
||||
LD a5, X, 4 * SIZE
|
||||
LD a6, X, 5 * SIZE
|
||||
LD a7, X, 6 * SIZE
|
||||
LD a8, X, 7 * SIZE
|
||||
bge $r0, I, .L13 /* exactly one group: skip the steady-state loop */
|
||||
.align 3
|
||||
|
||||
/* Contiguous steady-state loop: stores each held value to Y and reloads the
 * same register from the next group of X (offsets 8..15). */
.L12:
|
||||
ST a1, Y, 0 * SIZE
|
||||
LD a1, X, 8 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
LD a2, X, 9 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
LD a3, X, 10 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
LD a4, X, 11 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
LD a5, X, 12 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
LD a6, X, 13 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
LD a7, X, 14 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
LD a8, X, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 8 * SIZE /* advance both vectors by one group */
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L12 /* repeat while I > 0 */
|
||||
.align 3
|
||||
|
||||
/* Contiguous drain: stores the last preloaded group to Y, then steps X and Y
 * past it for the remainder loop. */
.L13:
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Contiguous remainder: copies the N & 3 leftover elements, then returns
 * inline instead of branching to .L999. */
.L15:
|
||||
andi I, N, 3 /* I = N mod 4 */
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d X, X, 2 * SIZE
|
||||
addi.d Y, Y, 2 * SIZE /* Y advances before the stores, hence the negative offsets */
|
||||
ST a1, Y, -2 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a2, Y, -1 * SIZE
|
||||
blt $r0, I, .L16
|
||||
move $r4, $r17 /* same exit sequence as .L999 */
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
NOP
|
||||
.align 3
|
||||
|
||||
/* Strided path setup: preloads 4 elements from X, stepping by INCX after each. */
.L20:
|
||||
srai.d I, N, 2 /* I = N / 4 */
|
||||
addi.d I, I, -1
|
||||
blt I, $r0, .L25 /* fewer than 4 elements: remainder loop only */
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L23 /* exactly one group: skip the steady-state loop */
|
||||
.align 3
|
||||
|
||||
/* Strided steady-state loop: for each of 4 elements, stores the held pair to Y
 * and reloads the same registers from X, stepping Y by INCY and X by INCX. */
.L22:
|
||||
ST a1, Y, 0 * SIZE
|
||||
LD a1, X, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a3, Y, 0 * SIZE
|
||||
LD a3, X, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a5, Y, 0 * SIZE
|
||||
LD a5, X, 0 * SIZE
|
||||
ST a6, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a7, Y, 0 * SIZE
|
||||
LD a7, X, 0 * SIZE
|
||||
ST a8, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a8, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L22 /* repeat while I > 0 */
|
||||
.align 3
|
||||
|
||||
/* Strided drain: stores the last preloaded 4 elements to Y without loading
 * further data. */
.L23:
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a5, Y, 0 * SIZE
|
||||
ST a6, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a7, Y, 0 * SIZE
|
||||
ST a8, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
.align 3
|
||||
|
||||
/* Strided remainder (N & 3 elements) and common exit. */
.L25:
|
||||
andi I, N, 3 /* I = N mod 4 */
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L26: /* one element per iteration */
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
.L999: /* exit: jirl jumps to the address held in $r1 */
|
||||
move $r4, $r17 /* copies the loop counter I into $r4 */
|
||||
fmov.d $f0, $f22 /* copies $f22 (a1) into $f0; NOTE(review): callers' use of $r4/$f0 is not visible here */
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
|
|
@ -0,0 +1,330 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define b1 $f14
|
||||
#define b2 $f15
|
||||
#define b3 $f16
|
||||
#define b4 $f17
|
||||
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
|
||||
PROLOGUE

/*
 * Entry and dispatch.
 *   N          element count
 *   X, Y       vector base pointers
 *   INCX, INCY element strides (converted to byte strides below)
 * Four partial sums s1..s4 are cleared here and combined at .L999.
 */
#ifdef F_INTERFACE
    /* Under F_INTERFACE the integer arguments arrive as pointers and are
       dereferenced here.  Use the "dest, base, offset" operand order that
       every other load in this file uses; the former "0(N)" spelling is
       MIPS syntax. */
    LDINT   N,    N,    0
    LDINT   INCX, INCX, 0
    LDINT   INCY, INCY, 0
#endif

    /* Clear the accumulators: s1 is written from $r0, the rest are copies. */
    MTC     s1, $r0
    MOV     s2, s1
    MOV     s3, s2
    MOV     s4, s3

    /* Scale the strides by ZBASE_SHIFT; TEMP = 2 * SIZE is the byte stride
       of a contiguous vector. */
    slli.d  INCX, INCX, ZBASE_SHIFT
    li.d    TEMP, 2 * SIZE
    slli.d  INCY, INCY, ZBASE_SHIFT

    /* Nothing to do for N <= 0. */
    bge     $r0, N, .L999

    /* I = N / 4: trip count of the 4-way unrolled loops. */
    srai.d  I, N, 2

    /* Any non-contiguous stride takes the strided path at .L20. */
    bne     INCX, TEMP, .L20
    bne     INCY, TEMP, .L20

    /* Contiguous path: fewer than four elements goes straight to the tail. */
    bge     $r0, I, .L15
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD b1, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD b2, Y, 1 * SIZE
|
||||
bge $r0, I, .L14
|
||||
.align 3
|
||||
|
||||
.L13:
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 2 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 3 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 2 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 3 * SIZE
|
||||
MADD s1, b3, a3, s1
|
||||
LD a1, X, 4 * SIZE
|
||||
MADD s2, b3, a4, s2
|
||||
LD a2, X, 5 * SIZE
|
||||
MADD s3, b4, a3, s3
|
||||
LD b1, Y, 4 * SIZE
|
||||
MADD s4, b4, a4, s4
|
||||
LD b2, Y, 5 * SIZE
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 6 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 7 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 6 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 7 * SIZE
|
||||
MADD s1, b3, a3, s1
|
||||
LD a1, X, 8 * SIZE
|
||||
MADD s2, b3, a4, s2
|
||||
LD a2, X, 9 * SIZE
|
||||
MADD s3, b4, a3, s3
|
||||
LD b1, Y, 8 * SIZE
|
||||
MADD s4, b4, a4, s4
|
||||
LD b2, Y, 9 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
.L14:
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 2 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 3 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 2 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 3 * SIZE
|
||||
MADD s1, b3, a3, s1
|
||||
LD a1, X, 4 * SIZE
|
||||
MADD s2, b3, a4, s2
|
||||
LD a2, X, 5 * SIZE
|
||||
MADD s3, b4, a3, s3
|
||||
LD b1, Y, 4 * SIZE
|
||||
MADD s4, b4, a4, s4
|
||||
LD b2, Y, 5 * SIZE
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 6 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 7 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 6 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 7 * SIZE
|
||||
MADD s1, b3, a3, s1
|
||||
addi.d X, X, 8 * SIZE
|
||||
MADD s2, b3, a4, s2
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
MADD s3, b4, a3, s3
|
||||
MADD s4, b4, a4, s4
|
||||
.align 3
|
||||
|
||||
/*
 * Unit-stride tail: processes the N & 3 elements left after the 4-way
 * unrolled loop.  The loop is software-pipelined: the operands of the next
 * element are loaded while the current ones are accumulated, and .L17 drains
 * the last element that was loaded.
 * NOTE(review): MADD presumably expands to a fused multiply-add
 * (d = a * b + c) -- confirm against common.h.
 */
.L15:
|
||||
/* I = number of leftover elements (0..3); none left -> finish at .L999. */
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
/* Preload the first leftover element: a1/a2 from X, b1/b2 from Y. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD b1, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD b2, Y, 1 * SIZE
|
||||
/* Only one leftover element: skip the loop and drain it at .L17. */
bge $r0, I, .L17
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
/* Accumulate the current element into s1..s4 while reloading a1/a2/b1/b2
   from offsets 2 and 3, i.e. the element that follows. */
MADD s1, b1, a1, s1
|
||||
addi.d I, I, -1
|
||||
MADD s2, b1, a2, s2
|
||||
LD b1, Y, 2 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD a1, X, 2 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD a2, X, 3 * SIZE
|
||||
LD b2, Y, 3 * SIZE
|
||||
/* Advance both pointers by one element (2 * SIZE). */
addi.d X, X, 2 * SIZE
|
||||
addi.d Y, Y, 2 * SIZE
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
.L17:
|
||||
/* Drain: accumulate the last element that was loaded, then finish. */
MADD s1, b1, a1, s1
|
||||
MADD s2, b1, a2, s2
|
||||
MADD s3, b2, a1, s3
|
||||
MADD s4, b2, a2, s4
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
.L20:
/*
 * Strided path entry.  Under F_INTERFACE a negative stride means the base
 * pointer has to be rewound by (N - 1) * stride bytes before the loops run
 * (INCX/INCY are already byte strides here).
 *
 * The previous sequence used mult/mflo/dsub, which are MIPS instructions
 * and do not exist on LoongArch; mul.d and sub.d compute the same value
 * without the HI/LO register pair.
 */
#ifdef F_INTERFACE
    bgez    INCX, .L21
    addi.d  TEMP, N, -1
    mul.d   TEMP, TEMP, INCX        /* TEMP = (N - 1) * INCX */
    sub.d   X, X, TEMP
    .align 3

.L21:
    bgez    INCY, .L22
    addi.d  TEMP, N, -1
    mul.d   TEMP, TEMP, INCY        /* TEMP = (N - 1) * INCY */
    sub.d   Y, Y, TEMP
    .align 3

.L22:
#endif
|
||||
bge $r0, I, .L25
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD b1, Y, 0 * SIZE
|
||||
LD b2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d I, I, -1
|
||||
add.d Y, Y, INCY
|
||||
bge $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
.L23:
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 1 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 0 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
MADD s1, b3, a3, s1
|
||||
LD a1, X, 0 * SIZE
|
||||
MADD s2, b3, a4, s2
|
||||
LD a2, X, 1 * SIZE
|
||||
MADD s3, b4, a3, s3
|
||||
LD b1, Y, 0 * SIZE
|
||||
MADD s4, b4, a4, s4
|
||||
LD b2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 1 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 0 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
MADD s1, b3, a3, s1
|
||||
LD a1, X, 0 * SIZE
|
||||
MADD s2, b3, a4, s2
|
||||
LD a2, X, 1 * SIZE
|
||||
MADD s3, b4, a3, s3
|
||||
LD b1, Y, 0 * SIZE
|
||||
MADD s4, b4, a4, s4
|
||||
LD b2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d I, I, -1
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
.L24:
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 1 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 0 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
MADD s1, b3, a3, s1
|
||||
LD a1, X, 0 * SIZE
|
||||
MADD s2, b3, a4, s2
|
||||
LD a2, X, 1 * SIZE
|
||||
MADD s3, b4, a3, s3
|
||||
LD b1, Y, 0 * SIZE
|
||||
MADD s4, b4, a4, s4
|
||||
LD b2, Y, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
add.d Y, Y, INCY
|
||||
MADD s1, b1, a1, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
MADD s2, b1, a2, s2
|
||||
LD a4, X, 1 * SIZE
|
||||
MADD s3, b2, a1, s3
|
||||
LD b3, Y, 0 * SIZE
|
||||
MADD s4, b2, a2, s4
|
||||
LD b4, Y, 1 * SIZE
|
||||
MADD s1, b3, a3, s1
|
||||
add.d X, X, INCX
|
||||
MADD s2, b3, a4, s2
|
||||
add.d Y, Y, INCY
|
||||
MADD s3, b4, a3, s3
|
||||
MADD s4, b4, a4, s4
|
||||
.align 3
|
||||
|
||||
.L25:
/*
 * Strided tail: handles the N & 3 elements left after the unrolled strided
 * loop, one element per iteration, with no software pipelining.
 */
    andi    I, N, 3
    bge     $r0, I, .L999
    .align 3
.L26:
    /* Fetch one element from each vector. */
    LD      b1, Y, 0 * SIZE
    LD      b2, Y, 1 * SIZE
    LD      a1, X, 0 * SIZE
    LD      a2, X, 1 * SIZE
    /* Step both pointers and the counter before the arithmetic. */
    add.d   Y, Y, INCY
    add.d   X, X, INCX
    addi.d  I, I, -1
    /* Accumulate the four cross products. */
    MADD    s1, b1, a1, s1
    MADD    s2, b1, a2, s2
    MADD    s3, b2, a1, s3
    MADD    s4, b2, a2, s4
    blt     $r0, I, .L26
    .align 3
|
||||
|
||||
.L999:
/*
 * Fold the four partial sums into the two result registers and return.
 *   without CONJ:  $f0 = s1 - s4,  $f1 = s3 + s2
 *   with    CONJ:  $f0 = s1 + s4,  $f1 = s3 - s2
 * ($f0/$f1 presumably carry the real/imaginary parts of the result.)
 */
#ifdef CONJ
    ADD     $f0, s1, s4
    SUB     $f1, s3, s2
#else
    SUB     $f0, s1, s4
    ADD     $f1, s3, s2
#endif
    jirl    $r0, $r1, 0x0

    EPILOGUE
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,648 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define M $r4
|
||||
#define N $r5
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INCX $r10
|
||||
#define Y $r11
|
||||
#define INCY $r6
|
||||
#define BUFFER $r17
|
||||
|
||||
#define YORIG $r18
|
||||
#define XX $r12
|
||||
#define YY $r13
|
||||
#define I $r14
|
||||
#define J $r15
|
||||
#define AO1 $r23
|
||||
#define AO2 $r24
|
||||
|
||||
#define ALPHA_R $f0
|
||||
#define ALPHA_I $f1
|
||||
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
#define x1 $f14
|
||||
#define x2 $f15
|
||||
#define x3 $f16
|
||||
#define x4 $f17
|
||||
#define y1 $f3
|
||||
#define y2 $f4
|
||||
#define y3 $f2
|
||||
#define y4 $f5
|
||||
#define t1 $f6
|
||||
#define t2 $f7
|
||||
#define t3 $f18
|
||||
#define t4 $f19
|
||||
#define t5 $f20
|
||||
#define t6 $f21
|
||||
#define t7 $f24
|
||||
#define t8 $f25
|
||||
|
||||
/*
 * Sign selection for the four partial products of each multiply.
 * In the loops below the macros are applied as
 *   MADD1  t_even, a_even, x1   MADD2  t_odd, a_even, x2
 *   MADD3  t_even, a_odd,  x2   MADD4  t_odd, a_odd,  x1
 * and each one is either MADD or NMSUB depending on CONJ / XCONJ:
 *
 *            CONJ  XCONJ |  MADD1  MADD2  MADD3  MADD4
 *             no    no   |  MADD   MADD   NMSUB  MADD
 *             yes   no   |  MADD   MADD   MADD   NMSUB
 *             no    yes  |  MADD   NMSUB  MADD   MADD
 *             yes   yes  |  MADD   NMSUB  NMSUB  NMSUB
 */

/* MADD1 is the same in every configuration. */
#define MADD1 MADD

/* MADD2 depends on XCONJ only. */
#ifdef XCONJ
#define MADD2 NMSUB
#else
#define MADD2 MADD
#endif

/* MADD3 is negated when CONJ and XCONJ are both set or both unset. */
#if defined(CONJ) == defined(XCONJ)
#define MADD3 NMSUB
#else
#define MADD3 MADD
#endif

/* MADD4 depends on CONJ only. */
#ifdef CONJ
#define MADD4 NMSUB
#else
#define MADD4 MADD
#endif
|
||||
|
||||
PROLOGUE

/*
 * Stack frame set-up.  INCY and BUFFER are read from the caller's stack
 * before $sp is moved.  The frame holds $r23/$r24 (AO1/AO2) and $f24/$f25
 * (t7/t8); when __64BIT__ is not defined it is twice as large and also
 * holds $f18..$f21 (t3..t6).
 */
    LDARG   INCY,   $sp, 0
    LDARG   BUFFER, $sp, 8

#ifdef __64BIT__
    addi.d  $sp, $sp, -32
#else
    addi.d  $sp, $sp, -64
    fst.d   $f18, $sp, 32
    fst.d   $f19, $sp, 40
    fst.d   $f20, $sp, 48
    fst.d   $f21, $sp, 56
#endif

    SDARG   $r23, $sp, 0
    SDARG   $r24, $sp, 8
    fst.d   $f24, $sp, 16
    fst.d   $f25, $sp, 24
|
||||
slli.d LDA, LDA, ZBASE_SHIFT
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, M, .L999
|
||||
slli.d INCY, INCY, ZBASE_SHIFT
|
||||
bge $r0, N, .L999
|
||||
li.d I, 2 * SIZE
|
||||
move YORIG, Y
|
||||
beq INCY, I, .L10
|
||||
srai.d I, M, 2
|
||||
move YORIG, BUFFER
|
||||
move XX, Y
|
||||
move YY, BUFFER
|
||||
bge $r0, I, .L05
|
||||
.align 3
|
||||
|
||||
.L02:
|
||||
LD a1, XX, 0 * SIZE
|
||||
LD a2, XX, 1 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a3, XX, 0 * SIZE
|
||||
LD a4, XX, 1 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a5, XX, 0 * SIZE
|
||||
LD a6, XX, 1 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a7, XX, 0 * SIZE
|
||||
LD a8, XX, 1 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
ST a1, YY, -8 * SIZE
|
||||
ST a2, YY, -7 * SIZE
|
||||
ST a3, YY, -6 * SIZE
|
||||
ST a4, YY, -5 * SIZE
|
||||
ST a5, YY, -4 * SIZE
|
||||
ST a6, YY, -3 * SIZE
|
||||
ST a7, YY, -2 * SIZE
|
||||
ST a8, YY, -1 * SIZE
|
||||
blt $r0, I, .L02
|
||||
.align 3
|
||||
|
||||
.L05:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L06:
|
||||
LD a1, XX, 0 * SIZE
|
||||
LD a2, XX, 1 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
addi.d I, I, -1
|
||||
ST a1, YY, 0 * SIZE
|
||||
ST a2, YY, 1 * SIZE
|
||||
addi.d YY, YY, 2 * SIZE
|
||||
blt $r0, I, .L06
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
srai.d J, N, 1
|
||||
bge $r0, J, .L20
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
LD x1, X, 0 * SIZE
|
||||
LD x2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD x3, X, 0 * SIZE
|
||||
LD x4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
MUL a1, ALPHA_R, x1
|
||||
move AO1, A
|
||||
MUL a2, ALPHA_I, x1
|
||||
add.d AO2, A, LDA
|
||||
MUL a3, ALPHA_R, x3
|
||||
add.d A, AO2, LDA
|
||||
MUL a4, ALPHA_I, x3
|
||||
#ifndef XCONJ
|
||||
NMSUB x1, x2, ALPHA_I, a1
|
||||
MADD x2, x2, ALPHA_R, a2
|
||||
NMSUB x3, x4, ALPHA_I, a3
|
||||
MADD x4, x4, ALPHA_R, a4
|
||||
#else
|
||||
MADD x1, x2, ALPHA_I, a1
|
||||
MSUB x2, x2, ALPHA_R, a2
|
||||
MADD x3, x4, ALPHA_I, a3
|
||||
MSUB x4, x4, ALPHA_R, a4
|
||||
#endif
|
||||
srai.d I, M, 2
|
||||
move YY, YORIG
|
||||
bge $r0, I, .L15
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
LD a6, AO2, 1 * SIZE
|
||||
LD a7, AO2, 2 * SIZE
|
||||
LD a8, AO2, 3 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
LD y1, YY, 4 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD1 t3, a3, x1, y3
|
||||
LD y2, YY, 5 * SIZE
|
||||
MADD2 t4, a3, x2, y4
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD3 t1, a2, x2, t1
|
||||
LD y3, YY, 6 * SIZE
|
||||
MADD4 t2, a2, x1, t2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD3 t3, a4, x2, t3
|
||||
LD y4, YY, 7 * SIZE
|
||||
MADD4 t4, a4, x1, t4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
MADD1 t1, a5, x3, t1
|
||||
MADD2 t2, a5, x4, t2
|
||||
LD a5, AO2, 4 * SIZE
|
||||
MADD1 t3, a7, x3, t3
|
||||
MADD2 t4, a7, x4, t4
|
||||
LD a7, AO2, 6 * SIZE
|
||||
MADD3 t1, a6, x4, t1
|
||||
MADD4 t2, a6, x3, t2
|
||||
LD a6, AO2, 5 * SIZE
|
||||
MADD3 t3, a8, x4, t3
|
||||
addi.d I, I, -1
|
||||
MADD4 t4, a8, x3, t4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
.L12:
|
||||
MADD1 t5, a1, x1, y1
|
||||
LD y1, YY, 8 * SIZE
|
||||
MADD2 t6, a1, x2, y2
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD1 t7, a3, x1, y3
|
||||
LD y2, YY, 9 * SIZE
|
||||
MADD2 t8, a3, x2, y4
|
||||
LD a3, AO1, 10 * SIZE
|
||||
MADD3 t5, a2, x2, t5
|
||||
LD y3, YY, 10 * SIZE
|
||||
MADD4 t6, a2, x1, t6
|
||||
LD a2, AO1, 9 * SIZE
|
||||
MADD3 t7, a4, x2, t7
|
||||
LD y4, YY, 11 * SIZE
|
||||
MADD4 t8, a4, x1, t8
|
||||
LD a4, AO1, 11 * SIZE
|
||||
MADD1 t5, a5, x3, t5
|
||||
ST t1, YY, 0 * SIZE
|
||||
MADD2 t6, a5, x4, t6
|
||||
LD a5, AO2, 8 * SIZE
|
||||
MADD1 t7, a7, x3, t7
|
||||
ST t2, YY, 1 * SIZE
|
||||
MADD2 t8, a7, x4, t8
|
||||
LD a7, AO2, 10 * SIZE
|
||||
MADD3 t5, a6, x4, t5
|
||||
ST t3, YY, 2 * SIZE
|
||||
MADD4 t6, a6, x3, t6
|
||||
LD a6, AO2, 9 * SIZE
|
||||
MADD3 t7, a8, x4, t7
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD4 t8, a8, x3, t8
|
||||
LD a8, AO2, 11 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
LD y1, YY, 12 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
LD a1, AO1, 12 * SIZE
|
||||
MADD1 t3, a3, x1, y3
|
||||
LD y2, YY, 13 * SIZE
|
||||
MADD2 t4, a3, x2, y4
|
||||
LD a3, AO1, 14 * SIZE
|
||||
MADD3 t1, a2, x2, t1
|
||||
LD y3, YY, 14 * SIZE
|
||||
MADD4 t2, a2, x1, t2
|
||||
LD a2, AO1, 13 * SIZE
|
||||
MADD3 t3, a4, x2, t3
|
||||
LD y4, YY, 15 * SIZE
|
||||
MADD4 t4, a4, x1, t4
|
||||
LD a4, AO1, 15 * SIZE
|
||||
MADD1 t1, a5, x3, t1
|
||||
ST t5, YY, 4 * SIZE
|
||||
MADD2 t2, a5, x4, t2
|
||||
LD a5, AO2, 12 * SIZE
|
||||
MADD1 t3, a7, x3, t3
|
||||
ST t6, YY, 5 * SIZE
|
||||
MADD2 t4, a7, x4, t4
|
||||
LD a7, AO2, 14 * SIZE
|
||||
MADD3 t1, a6, x4, t1
|
||||
ST t7, YY, 6 * SIZE
|
||||
MADD4 t2, a6, x3, t2
|
||||
LD a6, AO2, 13 * SIZE
|
||||
MADD3 t3, a8, x4, t3
|
||||
ST t8, YY, 7 * SIZE
|
||||
MADD4 t4, a8, x3, t4
|
||||
LD a8, AO2, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
.L13:
|
||||
ST t1, YY, 0 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
ST t2, YY, 1 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
ST t3, YY, 2 * SIZE
|
||||
MADD1 t3, a3, x1, y3
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD2 t4, a3, x2, y4
|
||||
MADD3 t1, a2, x2, t1
|
||||
MADD4 t2, a2, x1, t2
|
||||
MADD3 t3, a4, x2, t3
|
||||
MADD4 t4, a4, x1, t4
|
||||
MADD1 t1, a5, x3, t1
|
||||
MADD2 t2, a5, x4, t2
|
||||
MADD1 t3, a7, x3, t3
|
||||
MADD2 t4, a7, x4, t4
|
||||
MADD3 t1, a6, x4, t1
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
MADD4 t2, a6, x3, t2
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
MADD3 t3, a8, x4, t3
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
MADD4 t4, a8, x3, t4
|
||||
ST t1, YY, -4 * SIZE
|
||||
ST t2, YY, -3 * SIZE
|
||||
ST t3, YY, -2 * SIZE
|
||||
ST t4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L16
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
LD a5, AO2, 0 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
LD a6, AO2, 1 * SIZE
|
||||
MADD1 t3, a3, x1, y3
|
||||
LD a7, AO2, 2 * SIZE
|
||||
MADD2 t4, a3, x2, y4
|
||||
LD a8, AO2, 3 * SIZE
|
||||
MADD3 t1, a2, x2, t1
|
||||
MADD4 t2, a2, x1, t2
|
||||
MADD3 t3, a4, x2, t3
|
||||
MADD4 t4, a4, x1, t4
|
||||
MADD1 t1, a5, x3, t1
|
||||
MADD2 t2, a5, x4, t2
|
||||
MADD1 t3, a7, x3, t3
|
||||
MADD2 t4, a7, x4, t4
|
||||
MADD3 t1, a6, x4, t1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
MADD4 t2, a6, x3, t2
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
MADD3 t3, a8, x4, t3
|
||||
addi.d AO2, AO2, 4 * SIZE
|
||||
MADD4 t4, a8, x3, t4
|
||||
ST t1, YY, -4 * SIZE
|
||||
ST t2, YY, -3 * SIZE
|
||||
ST t3, YY, -2 * SIZE
|
||||
ST t4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
andi I, M, 1
|
||||
bge $r0, I, .L19
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
LD a5, AO2, 0 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
LD a6, AO2, 1 * SIZE
|
||||
MADD3 t1, a2, x2, t1
|
||||
MADD4 t2, a2, x1, t2
|
||||
MADD1 t1, a5, x3, t1
|
||||
MADD2 t2, a5, x4, t2
|
||||
MADD3 t1, a6, x4, t1
|
||||
MADD4 t2, a6, x3, t2
|
||||
ST t1, YY, 0 * SIZE
|
||||
ST t2, YY, 1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L19:
|
||||
addi.d J, J, -1
|
||||
blt $r0, J, .L11
|
||||
.align 3
|
||||
|
||||
.L20:
|
||||
andi J, N, 1
|
||||
bge $r0, J, .L900
|
||||
LD x1, X, 0 * SIZE
|
||||
LD x2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
MUL a1, ALPHA_R, x1
|
||||
move AO1, A
|
||||
MUL a2, ALPHA_I, x1
|
||||
#ifndef XCONJ
|
||||
NMSUB x1, x2, ALPHA_I, a1
|
||||
MADD x2, x2, ALPHA_R, a2
|
||||
#else
|
||||
MADD x1, x2, ALPHA_I, a1
|
||||
MSUB x2, x2, ALPHA_R, a2
|
||||
#endif
|
||||
srai.d I, M, 2
|
||||
move YY, YORIG
|
||||
bge $r0, I, .L25
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
LD y1, YY, 4 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD1 t3, a3, x1, y3
|
||||
LD y2, YY, 5 * SIZE
|
||||
MADD2 t4, a3, x2, y4
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD3 t1, a2, x2, t1
|
||||
LD y3, YY, 6 * SIZE
|
||||
MADD4 t2, a2, x1, t2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD3 t3, a4, x2, t3
|
||||
LD y4, YY, 7 * SIZE
|
||||
MADD4 t4, a4, x1, t4
|
||||
addi.d I, I, -1
|
||||
LD a4, AO1, 7 * SIZE
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
.L22:
|
||||
MADD1 t5, a1, x1, y1
|
||||
LD y1, YY, 8 * SIZE
|
||||
MADD2 t6, a1, x2, y2
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD1 t7, a3, x1, y3
|
||||
LD y2, YY, 9 * SIZE
|
||||
MADD2 t8, a3, x2, y4
|
||||
LD a3, AO1, 10 * SIZE
|
||||
MADD3 t5, a2, x2, t5
|
||||
LD y3, YY, 10 * SIZE
|
||||
MADD4 t6, a2, x1, t6
|
||||
LD a2, AO1, 9 * SIZE
|
||||
MADD3 t7, a4, x2, t7
|
||||
LD y4, YY, 11 * SIZE
|
||||
MADD4 t8, a4, x1, t8
|
||||
LD a4, AO1, 11 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
ST t2, YY, 1 * SIZE
|
||||
ST t3, YY, 2 * SIZE
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
LD y1, YY, 12 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
LD a1, AO1, 12 * SIZE
|
||||
MADD1 t3, a3, x1, y3
|
||||
LD y2, YY, 13 * SIZE
|
||||
MADD2 t4, a3, x2, y4
|
||||
LD a3, AO1, 14 * SIZE
|
||||
MADD3 t1, a2, x2, t1
|
||||
LD y3, YY, 14 * SIZE
|
||||
MADD4 t2, a2, x1, t2
|
||||
LD a2, AO1, 13 * SIZE
|
||||
MADD3 t3, a4, x2, t3
|
||||
LD y4, YY, 15 * SIZE
|
||||
MADD4 t4, a4, x1, t4
|
||||
LD a4, AO1, 15 * SIZE
|
||||
ST t5, YY, 4 * SIZE
|
||||
ST t6, YY, 5 * SIZE
|
||||
ST t7, YY, 6 * SIZE
|
||||
ST t8, YY, 7 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
.L23:
|
||||
ST t1, YY, 0 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
ST t2, YY, 1 * SIZE
|
||||
MADD2 t2, a1, x2, y2
|
||||
ST t3, YY, 2 * SIZE
|
||||
MADD1 t3, a3, x1, y3
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD2 t4, a3, x2, y4
|
||||
MADD3 t1, a2, x2, t1
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
MADD4 t2, a2, x1, t2
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
MADD3 t3, a4, x2, t3
|
||||
MADD4 t4, a4, x1, t4
|
||||
ST t1, YY, -4 * SIZE
|
||||
ST t2, YY, -3 * SIZE
|
||||
ST t3, YY, -2 * SIZE
|
||||
ST t4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L26
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
MADD2 t2, a1, x2, y2
|
||||
MADD1 t3, a3, x1, y3
|
||||
MADD2 t4, a3, x2, y4
|
||||
MADD3 t1, a2, x2, t1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
MADD4 t2, a2, x1, t2
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
MADD3 t3, a4, x2, t3
|
||||
MADD4 t4, a4, x1, t4
|
||||
ST t1, YY, -4 * SIZE
|
||||
ST t2, YY, -3 * SIZE
|
||||
ST t3, YY, -2 * SIZE
|
||||
ST t4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
andi I, M, 1
|
||||
bge $r0, I, .L900
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
MADD1 t1, a1, x1, y1
|
||||
MADD2 t2, a1, x2, y2
|
||||
MADD3 t1, a2, x2, t1
|
||||
MADD4 t2, a2, x1, t2
|
||||
ST t1, YY, 0 * SIZE
|
||||
ST t2, YY, 1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L900:
|
||||
li.d YORIG, 2 * SIZE
|
||||
srai.d I, M, 2
|
||||
beq INCY, YORIG, .L999
|
||||
move XX, BUFFER
|
||||
bge $r0, I, .L905
|
||||
.align 3
|
||||
|
||||
.L902:
|
||||
LD a1, XX, 0 * SIZE
|
||||
LD a2, XX, 1 * SIZE
|
||||
LD a3, XX, 2 * SIZE
|
||||
LD a4, XX, 3 * SIZE
|
||||
LD a5, XX, 4 * SIZE
|
||||
LD a6, XX, 5 * SIZE
|
||||
LD a7, XX, 6 * SIZE
|
||||
LD a8, XX, 7 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
ST a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a5, Y, 0 * SIZE
|
||||
ST a6, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a7, Y, 0 * SIZE
|
||||
ST a8, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
blt $r0, I, .L902
|
||||
.align 3
|
||||
|
||||
.L905:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L906:
|
||||
LD a1, XX, 0 * SIZE
|
||||
LD a2, XX, 1 * SIZE
|
||||
addi.d XX, XX, 2 * SIZE
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L906
|
||||
.align 3
|
||||
|
||||
.L999:
/*
 * Common exit: reload the registers saved in the prologue, release the
 * frame (32 bytes with __64BIT__, otherwise 64 bytes plus $f18..$f21),
 * copy $r17 into $r4 and $f22 into $f0, and return through $r1.
 */
    LDARG   $r23, $sp, 0
    LDARG   $r24, $sp, 8
    fld.d   $f24, $sp, 16
    fld.d   $f25, $sp, 24
#ifndef __64BIT__
    fld.d   $f18, $sp, 32
    fld.d   $f19, $sp, 40
    fld.d   $f20, $sp, 48
    fld.d   $f21, $sp, 56
    addi.d  $sp, $sp, 64
#else
    addi.d  $sp, $sp, 32
#endif
    move    $r4, $r17
    fmov.d  $f0, $f22
    jirl    $r0, $r1, 0x0

    EPILOGUE
|
||||
|
|
@ -0,0 +1,556 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define M $r4
|
||||
#define N $r5
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INCX $r10
|
||||
#define Y $r11
|
||||
#define INCY $r6
|
||||
#define BUFFER $r17
|
||||
|
||||
#define XORIG $r18
|
||||
#define XX $r12
|
||||
#define YY $r13
|
||||
#define I $r14
|
||||
#define J $r15
|
||||
#define AO1 $r23
|
||||
#define AO2 $r24
|
||||
|
||||
#define ALPHA_R $f0
|
||||
#define ALPHA_I $f1
|
||||
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
#define y1 $f14
|
||||
#define y2 $f15
|
||||
#define y3 $f16
|
||||
#define y4 $f17
|
||||
#define x1 $f3
|
||||
#define x2 $f4
|
||||
#define x3 $f2
|
||||
#define x4 $f5
|
||||
#define x5 $f6
|
||||
#define x6 $f7
|
||||
#define x7 $f18
|
||||
#define x8 $f19
|
||||
|
||||
/*
 * Sign selection for the four partial products of each multiply.
 * Each macro is either MADD or NMSUB depending on CONJ / XCONJ:
 *
 *            CONJ  XCONJ |  MADD1  MADD2  MADD3  MADD4
 *             no    no   |  MADD   MADD   NMSUB  MADD
 *             yes   no   |  MADD   MADD   MADD   NMSUB
 *             no    yes  |  MADD   NMSUB  MADD   MADD
 *             yes   yes  |  MADD   NMSUB  NMSUB  NMSUB
 */

/* MADD1 is the same in every configuration. */
#define MADD1 MADD

/* MADD2 depends on XCONJ only. */
#ifdef XCONJ
#define MADD2 NMSUB
#else
#define MADD2 MADD
#endif

/* MADD3 is negated when CONJ and XCONJ are both set or both unset. */
#if defined(CONJ) == defined(XCONJ)
#define MADD3 NMSUB
#else
#define MADD3 MADD
#endif

/* MADD4 depends on CONJ only. */
#ifdef CONJ
#define MADD4 NMSUB
#else
#define MADD4 MADD
#endif
|
||||
|
||||
/* Entry: fetch stack arguments, build the frame, scale the strides, reject
   empty problems and decide whether X must be gathered into BUFFER. */
PROLOGUE
|
||||
|
||||
/* INCY and BUFFER are read from the caller's stack area (offsets 0 and 8 from
   the incoming $sp) before the local frame is allocated. */
LDARG INCY, $sp, 0
|
||||
LDARG BUFFER, $sp, 8
|
||||
/* Local frame: 16 bytes, or 32 when __64BIT__ is not defined (room for the
   $f18/$f19 spill below). */
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, -16
|
||||
#else
|
||||
addi.d $sp, $sp, -32
|
||||
#endif
|
||||
/* y1 <- $r0: y1 is the zero seed later copied into y2..y4.  MTC is a macro
   not defined in the visible source (presumably an integer-to-FP move). */
MTC y1, $r0
|
||||
/* Save $r23/$r24, which this routine uses as AO1/AO2. */
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
/* LDA, INCX and INCY are shifted left by ZBASE_SHIFT (defined elsewhere);
   presumably converts element counts into byte strides. */
slli.d LDA, LDA, ZBASE_SHIFT
|
||||
#ifndef __64BIT__
|
||||
fst.d $f18, $sp, 16
|
||||
|
||||
fst.d $f19, $sp, 24
|
||||
#endif
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
/* M <= 0 (and, two lines down, N <= 0): nothing to do, go to the exit. */
bge $r0, M, .L999
|
||||
slli.d INCY, INCY, ZBASE_SHIFT
|
||||
bge $r0, N, .L999
|
||||
/* If the scaled INCX equals 2 * SIZE, X is used in place (XORIG = X) and the
   copy loops are skipped. */
li.d I, 2 * SIZE
|
||||
move XORIG, X
|
||||
beq INCX, I, .L10
|
||||
/* Otherwise gather X into BUFFER: XORIG and the write cursor YY start at
   BUFFER; I = M >> 2 iterations of the 4-element copy loop. */
srai.d I, M, 2
|
||||
move XORIG, BUFFER
|
||||
move YY, BUFFER
|
||||
bge $r0, I, .L05
|
||||
.align 3
|
||||
|
||||
/* .L02: copy four INCX-strided X elements (two values each) per iteration
   into eight consecutive slots of BUFFER through the YY cursor. */
.L02:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d I, I, -1
|
||||
/* YY is advanced first, so the stores below use negative offsets. */
addi.d YY, YY, 8 * SIZE
|
||||
ST a1, YY, -8 * SIZE
|
||||
ST a2, YY, -7 * SIZE
|
||||
ST a3, YY, -6 * SIZE
|
||||
ST a4, YY, -5 * SIZE
|
||||
ST a5, YY, -4 * SIZE
|
||||
ST a6, YY, -3 * SIZE
|
||||
ST a7, YY, -2 * SIZE
|
||||
ST a8, YY, -1 * SIZE
|
||||
blt $r0, I, .L02
|
||||
.align 3
|
||||
|
||||
/* .L05: remaining M & 3 elements; none left -> start computing at .L10. */
.L05:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
/* .L06: copy one element (two values) per iteration. */
.L06:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, YY, 0 * SIZE
|
||||
ST a2, YY, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 2 * SIZE
|
||||
blt $r0, I, .L06
|
||||
.align 3
|
||||
|
||||
/* .L10: the LDA-strided vectors of A are processed two at a time, J = N >> 1
   pairs.  From here on YY is the store cursor into Y while Y itself remains
   the load cursor.  No pair -> go to the odd-N tail at .L20. */
.L10:
|
||||
srai.d J, N, 1
|
||||
move YY, Y
|
||||
bge $r0, J, .L20
|
||||
.align 3
|
||||
|
||||
/* .L11: start of one pair.  AO1 = A, AO2 = A + LDA, and A moves on by
   2 * LDA for the next pair.  y2..y4 are cleared by copying y1, which is zero
   here (set in the prologue, re-zeroed at .L19).  XX rewinds to XORIG. */
.L11:
|
||||
move AO1, A
|
||||
MOV y2, y1
|
||||
add.d AO2, A, LDA
|
||||
MOV y3, y1
|
||||
add.d A, AO2, LDA
|
||||
MOV y4, y1
|
||||
srai.d I, M, 2
|
||||
move XX, XORIG
|
||||
/* Fewer than four elements: skip the unrolled section. */
bge $r0, I, .L15
|
||||
/* Preload the first operands for the loop, whose loads run ahead of their
   use; x3 is loaded inside the loop body. */
LD x1, XX, 0 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD x4, XX, 3 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a3, AO2, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD a4, AO2, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD a7, AO2, 2 * SIZE
|
||||
LD a6, AO1, 3 * SIZE
|
||||
LD a8, AO2, 3 * SIZE
|
||||
/* One group of four is reserved for the drain pass at .L13; if that was the
   only group, go straight there. */
addi.d I, I, -1
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
/* .L12: main loop for a pair of vectors.  Each iteration accumulates four X
   elements against both vectors (y1/y2 from AO1, y3/y4 from AO2) while loading
   the operands that follow.  Loads at offsets 8..11 belong to the next group of
   four, which is why the last group is handled separately at .L13. */
.L12:
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD1 y3, a3, x1, y3
|
||||
MADD2 y4, a3, x2, y4
|
||||
LD a3, AO2, 4 * SIZE
|
||||
MADD3 y1, a2, x2, y1
|
||||
MADD4 y2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD3 y3, a4, x2, y3
|
||||
LD x2, XX, 5 * SIZE
|
||||
MADD4 y4, a4, x1, y4
|
||||
LD a4, AO2, 5 * SIZE
|
||||
MADD1 y1, a5, x3, y1
|
||||
LD x1, XX, 4 * SIZE
|
||||
MADD2 y2, a5, x4, y2
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD1 y3, a7, x3, y3
|
||||
MADD2 y4, a7, x4, y4
|
||||
LD a7, AO2, 6 * SIZE
|
||||
MADD3 y1, a6, x4, y1
|
||||
addi.d I, I, -1
|
||||
MADD4 y2, a6, x3, y2
|
||||
LD a6, AO1, 7 * SIZE
|
||||
MADD3 y3, a8, x4, y3
|
||||
LD x4, XX, 7 * SIZE
|
||||
MADD4 y4, a8, x3, y4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
/* Second half: elements 2 and 3 of the group, loading the next group. */
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 6 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD1 y3, a3, x1, y3
|
||||
MADD2 y4, a3, x2, y4
|
||||
LD a3, AO2, 8 * SIZE
|
||||
MADD3 y1, a2, x2, y1
|
||||
MADD4 y2, a2, x1, y2
|
||||
LD a2, AO1, 9 * SIZE
|
||||
MADD3 y3, a4, x2, y3
|
||||
LD x2, XX, 9 * SIZE
|
||||
MADD4 y4, a4, x1, y4
|
||||
LD a4, AO2, 9 * SIZE
|
||||
MADD1 y1, a5, x3, y1
|
||||
LD x1, XX, 8 * SIZE
|
||||
MADD2 y2, a5, x4, y2
|
||||
LD a5, AO1, 10 * SIZE
|
||||
MADD1 y3, a7, x3, y3
|
||||
/* XX (here) and AO2 (below) are advanced mid-iteration: the later loads at
   offset 3 from the new XX/AO2 equal offset 11 from the old ones.  AO1 is
   advanced last, so its loads keep using offsets up to 11. */
addi.d XX, XX, 8 * SIZE
|
||||
MADD2 y4, a7, x4, y4
|
||||
LD a7, AO2, 10 * SIZE
|
||||
MADD3 y1, a6, x4, y1
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
MADD4 y2, a6, x3, y2
|
||||
LD a6, AO1, 11 * SIZE
|
||||
MADD3 y3, a8, x4, y3
|
||||
LD x4, XX, 3 * SIZE
|
||||
MADD4 y4, a8, x3, y4
|
||||
LD a8, AO2, 3 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* .L13: drain pass for the pair loop.  Same arithmetic as one .L12 iteration,
   but the only loads are at offsets 2..7 of the current group, so nothing
   beyond this group of four elements is read. */
.L13:
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD1 y3, a3, x1, y3
|
||||
MADD2 y4, a3, x2, y4
|
||||
LD a3, AO2, 4 * SIZE
|
||||
MADD3 y1, a2, x2, y1
|
||||
MADD4 y2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD3 y3, a4, x2, y3
|
||||
LD x2, XX, 5 * SIZE
|
||||
MADD4 y4, a4, x1, y4
|
||||
LD a4, AO2, 5 * SIZE
|
||||
MADD1 y1, a5, x3, y1
|
||||
LD x1, XX, 4 * SIZE
|
||||
MADD2 y2, a5, x4, y2
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD1 y3, a7, x3, y3
|
||||
MADD2 y4, a7, x4, y4
|
||||
LD a7, AO2, 6 * SIZE
|
||||
MADD3 y1, a6, x4, y1
|
||||
MADD4 y2, a6, x3, y2
|
||||
LD a6, AO1, 7 * SIZE
|
||||
MADD3 y3, a8, x4, y3
|
||||
LD x4, XX, 7 * SIZE
|
||||
MADD4 y4, a8, x3, y4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 6 * SIZE
|
||||
/* All operands of the group are now in registers: arithmetic only, with the
   three cursor increments interleaved at the end. */
MADD2 y2, a1, x2, y2
|
||||
MADD1 y3, a3, x1, y3
|
||||
MADD2 y4, a3, x2, y4
|
||||
MADD3 y1, a2, x2, y1
|
||||
MADD4 y2, a2, x1, y2
|
||||
MADD3 y3, a4, x2, y3
|
||||
MADD4 y4, a4, x1, y4
|
||||
MADD1 y1, a5, x3, y1
|
||||
MADD2 y2, a5, x4, y2
|
||||
MADD1 y3, a7, x3, y3
|
||||
MADD2 y4, a7, x4, y4
|
||||
MADD3 y1, a6, x4, y1
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
MADD4 y2, a6, x3, y2
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
MADD3 y3, a8, x4, y3
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
MADD4 y4, a8, x3, y4
|
||||
.align 3
|
||||
|
||||
/* .L15: if bit 1 of M is set, accumulate two more elements against both
   vectors (all loads first, then the arithmetic). */
.L15:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L17
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
LD x4, XX, 3 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a3, AO2, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD a4, AO2, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD a7, AO2, 2 * SIZE
|
||||
LD a6, AO1, 3 * SIZE
|
||||
LD a8, AO2, 3 * SIZE
|
||||
MADD1 y1, a1, x1, y1
|
||||
MADD2 y2, a1, x2, y2
|
||||
MADD1 y3, a3, x1, y3
|
||||
MADD2 y4, a3, x2, y4
|
||||
MADD3 y1, a2, x2, y1
|
||||
MADD4 y2, a2, x1, y2
|
||||
MADD3 y3, a4, x2, y3
|
||||
MADD4 y4, a4, x1, y4
|
||||
MADD1 y1, a5, x3, y1
|
||||
MADD2 y2, a5, x4, y2
|
||||
MADD1 y3, a7, x3, y3
|
||||
MADD2 y4, a7, x4, y4
|
||||
MADD3 y1, a6, x4, y1
|
||||
addi.d XX, XX, 4 * SIZE
|
||||
MADD4 y2, a6, x3, y2
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
MADD3 y3, a8, x4, y3
|
||||
addi.d AO2, AO2, 4 * SIZE
|
||||
MADD4 y4, a8, x3, y4
|
||||
.align 3
|
||||
|
||||
/* .L17: if bit 0 of M is set, one last element remains.
   NOTE(review): the .align below sits between the andi and its branch, so any
   padding it emits is executed -- presumably no-ops; confirm. */
.L17:
|
||||
andi I, M, 1
|
||||
.align 3
|
||||
|
||||
bge $r0, I, .L19
|
||||
/* .L18: accumulate the final element against both vectors. */
.L18:
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a3, AO2, 0 * SIZE
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD a2, AO1, 1 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a4, AO2, 1 * SIZE
|
||||
MADD1 y3, a3, x1, y3
|
||||
MADD2 y4, a3, x2, y4
|
||||
MADD3 y1, a2, x2, y1
|
||||
MADD4 y2, a2, x1, y2
|
||||
MADD3 y3, a4, x2, y3
|
||||
MADD4 y4, a4, x1, y4
|
||||
.align 3
|
||||
|
||||
/* .L19: fold the pair's sums into two Y elements.  y1/y2 are the sums for the
   AO1 vector, y3/y4 for the AO2 vector.  Assuming MADD d,a,b,c = a*b + c and
   NMSUB d,a,b,c = c - a*b (macros defined elsewhere):
     a1 = Y[0] + y1*ALPHA_R - y2*ALPHA_I
     a2 = Y[1] + y1*ALPHA_I + y2*ALPHA_R
   and likewise a3/a4 from y3/y4.  Y is the load cursor, YY the store cursor;
   both step by INCY per element. */
.L19:
|
||||
LD a1, Y, 0 * SIZE
|
||||
LD a2, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a3, Y, 0 * SIZE
|
||||
LD a4, Y, 1 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
MADD a1, y1, ALPHA_R, a1
|
||||
MADD a2, y1, ALPHA_I, a2
|
||||
MADD a3, y3, ALPHA_R, a3
|
||||
MADD a4, y3, ALPHA_I, a4
|
||||
NMSUB a1, y2, ALPHA_I, a1
|
||||
MADD a2, y2, ALPHA_R, a2
|
||||
NMSUB a3, y4, ALPHA_I, a3
|
||||
/* y1 has been consumed above; re-zero it so .L11/.L20 can seed y2..y4. */
MTC y1, $r0
|
||||
MADD a4, y4, ALPHA_R, a4
|
||||
addi.d J, J, -1
|
||||
ST a1, YY, 0 * SIZE
|
||||
ST a2, YY, 1 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST a3, YY, 0 * SIZE
|
||||
ST a4, YY, 1 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
/* More pairs left -> next pair. */
blt $r0, J, .L11
|
||||
.align 3
|
||||
|
||||
/* .L20: tail for odd N -- one remaining LDA-strided vector of A.  The MOV and
   srai are issued before the N & 1 test; if the branch to .L999 is taken their
   results are simply unused.  In this section the accumulators are split
   differently from the pair loop: y1/y2 collect the MADD1/MADD2 terms and
   y3/y4 the MADD3/MADD4 terms; .L29 adds them together. */
.L20:
|
||||
andi J, N, 1
|
||||
MOV y2, y1
|
||||
srai.d I, M, 2
|
||||
bge $r0, J, .L999
|
||||
MOV y3, y1
|
||||
move AO1, A
|
||||
MOV y4, y1
|
||||
move XX, XORIG
|
||||
/* Fewer than four elements: skip the unrolled section. */
bge $r0, I, .L25
|
||||
/* Preload the first operands; x3 is loaded inside the loop body. */
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x4, XX, 3 * SIZE
|
||||
/* One group of four is reserved for the drain pass at .L23. */
addi.d I, I, -1
|
||||
LD a6, AO1, 3 * SIZE
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
/* .L22: main loop for the single remaining vector.  Each iteration
   accumulates four X elements while loading the operands that follow; loads at
   offsets 8..11 belong to the next group of four, so the last group is handled
   by .L23.  Unlike .L12, both cursors are advanced only at the end. */
.L22:
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD3 y3, a2, x2, y3
|
||||
LD x2, XX, 5 * SIZE
|
||||
MADD4 y4, a2, x1, y4
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD1 y1, a5, x3, y1
|
||||
LD x1, XX, 4 * SIZE
|
||||
MADD2 y2, a5, x4, y2
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD3 y3, a6, x4, y3
|
||||
LD x4, XX, 7 * SIZE
|
||||
MADD4 y4, a6, x3, y4
|
||||
LD a6, AO1, 7 * SIZE
|
||||
/* Second half: elements 2 and 3 of the group, loading the next group. */
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 6 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD3 y3, a2, x2, y3
|
||||
LD x2, XX, 9 * SIZE
|
||||
MADD4 y4, a2, x1, y4
|
||||
LD a2, AO1, 9 * SIZE
|
||||
MADD1 y1, a5, x3, y1
|
||||
LD x1, XX, 8 * SIZE
|
||||
MADD2 y2, a5, x4, y2
|
||||
LD a5, AO1, 10 * SIZE
|
||||
MADD3 y3, a6, x4, y3
|
||||
LD x4, XX, 11 * SIZE
|
||||
MADD4 y4, a6, x3, y4
|
||||
LD a6, AO1, 11 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
/* .L23: drain pass for the single-vector loop.  Same arithmetic as one .L22
   iteration, but the only loads are at offsets 2..7 of the current group, so
   nothing beyond this group of four elements is read. */
.L23:
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD3 y3, a2, x2, y3
|
||||
LD x2, XX, 5 * SIZE
|
||||
MADD4 y4, a2, x1, y4
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD1 y1, a5, x3, y1
|
||||
LD x1, XX, 4 * SIZE
|
||||
MADD2 y2, a5, x4, y2
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD3 y3, a6, x4, y3
|
||||
LD x4, XX, 7 * SIZE
|
||||
MADD4 y4, a6, x3, y4
|
||||
LD a6, AO1, 7 * SIZE
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 6 * SIZE
|
||||
/* All operands of the group are now in registers: arithmetic only, with the
   two cursor increments interleaved at the end. */
MADD2 y2, a1, x2, y2
|
||||
MADD3 y3, a2, x2, y3
|
||||
MADD4 y4, a2, x1, y4
|
||||
MADD1 y1, a5, x3, y1
|
||||
MADD2 y2, a5, x4, y2
|
||||
MADD3 y3, a6, x4, y3
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
MADD4 y4, a6, x3, y4
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/* .L25: if bit 1 of M is set, accumulate two more elements of the single
   vector. */
.L25:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L27
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
MADD1 y1, a1, x1, y1
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD2 y2, a1, x2, y2
|
||||
LD a6, AO1, 3 * SIZE
|
||||
MADD3 y3, a2, x2, y3
|
||||
LD x4, XX, 3 * SIZE
|
||||
MADD4 y4, a2, x1, y4
|
||||
MADD1 y1, a5, x3, y1
|
||||
MADD2 y2, a5, x4, y2
|
||||
MADD3 y3, a6, x4, y3
|
||||
addi.d XX, XX, 4 * SIZE
|
||||
MADD4 y4, a6, x3, y4
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
.align 3
|
||||
|
||||
/* .L27: if bit 0 of M is set, one last element remains.
   NOTE(review): the .align below sits between the andi and its branch, so any
   padding it emits is executed -- presumably no-ops; confirm. */
.L27:
|
||||
andi I, M, 1
|
||||
.align 3
|
||||
|
||||
bge $r0, I, .L29
|
||||
/* .L28: accumulate the final element. */
.L28:
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
MADD1 y1, a1, x1, y1
|
||||
MADD2 y2, a1, x2, y2
|
||||
MADD3 y3, a2, x2, y3
|
||||
MADD4 y4, a2, x1, y4
|
||||
.align 3
|
||||
|
||||
/* .L29: finish the single-vector tail.  The split accumulators are merged
   (y1 += y3, y2 += y4), then one Y element is updated with the same formula as
   the pair case and stored through YY:
     a1 = Y[0] + y1*ALPHA_R - y2*ALPHA_I
     a2 = Y[1] + y1*ALPHA_I + y2*ALPHA_R
   (assuming MADD d,a,b,c = a*b + c and NMSUB d,a,b,c = c - a*b). */
.L29:
|
||||
LD a1, Y, 0 * SIZE
|
||||
LD a2, Y, 1 * SIZE
|
||||
ADD y1, y1, y3
|
||||
ADD y2, y2, y4
|
||||
MADD a1, y1, ALPHA_R, a1
|
||||
MADD a2, y1, ALPHA_I, a2
|
||||
NMSUB a1, y2, ALPHA_I, a1
|
||||
MADD a2, y2, ALPHA_R, a2
|
||||
ST a1, YY, 0 * SIZE
|
||||
ST a2, YY, 1 * SIZE
|
||||
.align 3
|
||||
|
||||
/* .L999: common exit, also reached directly when M <= 0 or N <= 0.  Restores
   what the prologue saved, releases the frame and returns. */
.L999:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
#ifndef __64BIT__
|
||||
fld.d $f18, $sp, 16
|
||||
fld.d $f19, $sp, 24
|
||||
#endif
|
||||
/* Frame size mirrors the prologue: 16 bytes, or 32 without __64BIT__. */
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, 16
|
||||
#else
|
||||
addi.d $sp, $sp, 32
|
||||
#endif
|
||||
/* $r4 <- $r17 (the BUFFER register) and $f0 <- $f22 (the a1 register).
   NOTE(review): the intent is not evident from this file -- presumably fills
   the return-value registers; confirm whether callers use the result. */
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
/* Jump to the address in $r1 without linking ($r0 as destination), i.e.
   return to the caller. */
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue