Merge pull request #3394 from xianyi/develop

Merge from develop for 0.3.18
This commit is contained in:
Martin Kroeker 2021-10-02 19:35:27 +02:00 committed by GitHub
commit c75759876c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
165 changed files with 35976 additions and 982 deletions

View File

@ -1,33 +1,38 @@
# XXX: Precise is already deprecated, new default is Trusty.
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming
dist: precise
dist: focal
sudo: true
language: c
matrix:
include:
- &test-ubuntu
os: linux
# os: linux
compiler: gcc
addons:
apt:
packages:
- gfortran
# before_script: &common-before
# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
# script:
# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
# - make -C test $COMMON_FLAGS $BTYPE
# - make -C ctest $COMMON_FLAGS $BTYPE
# - make -C utest $COMMON_FLAGS $BTYPE
# env:
# - TARGET_BOX=LINUX64
# - BTYPE="BINARY=64"
#
# - <<: *test-ubuntu
os: linux-ppc64le
before_script: &common-before
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
script:
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
@ -55,38 +60,38 @@ matrix:
- TARGET_BOX=IBMZ_LINUX
- BTYPE="BINARY=64 USE_OPENMP=0 CC=clang"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-ubuntu
compiler: clang
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64 CC=clang"
- <<: *test-ubuntu
compiler: clang
env:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64 INTERFACE64=1 CC=clang"
- <<: *test-ubuntu
addons:
apt:
packages:
- gcc-multilib
- gfortran-multilib
env:
- TARGET_BOX=LINUX32
- BTYPE="BINARY=32"
# - <<: *test-ubuntu
# env:
# - TARGET_BOX=LINUX64
# - BTYPE="BINARY=64 USE_OPENMP=1"
#
# - <<: *test-ubuntu
# env:
# - TARGET_BOX=LINUX64
# - BTYPE="BINARY=64 INTERFACE64=1"
#
# - <<: *test-ubuntu
# compiler: clang
# env:
# - TARGET_BOX=LINUX64
# - BTYPE="BINARY=64 CC=clang"
#
# - <<: *test-ubuntu
# compiler: clang
# env:
# - TARGET_BOX=LINUX64
# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang"
#
# - <<: *test-ubuntu
# addons:
# apt:
# packages:
# - gcc-multilib
# - gfortran-multilib
# env:
# - TARGET_BOX=LINUX32
# - BTYPE="BINARY=32"
#
- os: linux
arch: ppc64le
dist: bionic
@ -121,47 +126,47 @@ matrix:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX_P9
- os: linux
compiler: gcc
addons:
apt:
packages:
- binutils-mingw-w64-x86-64
- gcc-mingw-w64-x86-64
- gfortran-mingw-w64-x86-64
before_script: *common-before
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- TARGET_BOX=WIN64
- BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
# - os: linux
# compiler: gcc
# addons:
# apt:
# packages:
# - binutils-mingw-w64-x86-64
# - gcc-mingw-w64-x86-64
# - gfortran-mingw-w64-x86-64
# before_script: *common-before
# script:
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
# env:
# - TARGET_BOX=WIN64
# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
#
# Build & test on Alpine Linux inside chroot, i.e. on system with musl libc.
# These jobs need sudo, so Travis runs them on VM-based infrastructure
# which is slower than container-based infrastructure used for jobs
# that don't require sudo.
- &test-alpine
os: linux
dist: trusty
sudo: true
language: minimal
before_install:
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
install:
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
before_script: *common-before
script:
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
- alpine make -C test $COMMON_FLAGS $BTYPE
- alpine make -C ctest $COMMON_FLAGS $BTYPE
- alpine make -C utest $COMMON_FLAGS $BTYPE
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64"
# - &test-alpine
# os: linux
# dist: trusty
# sudo: true
# language: minimal
# before_install:
# - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
# && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
# - alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
# install:
# - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
# before_script: *common-before
# script:
# # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
# - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
# CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
# - alpine make -C test $COMMON_FLAGS $BTYPE
# - alpine make -C ctest $COMMON_FLAGS $BTYPE
# - alpine make -C utest $COMMON_FLAGS $BTYPE
# env:
# - TARGET_BOX=LINUX64_MUSL
# - BTYPE="BINARY=64"
# XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS,
# but only on Travis CI, cannot reproduce it elsewhere.
@ -171,98 +176,98 @@ matrix:
# - TARGET_BOX=LINUX64_MUSL
# - BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-alpine
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 INTERFACE64=1"
# - <<: *test-alpine
# env:
# - TARGET_BOX=LINUX64_MUSL
# - BTYPE="BINARY=64 INTERFACE64=1"
#
# # Build with the same flags that Alpine uses in its OpenBLAS package.
# - <<: *test-alpine
# env:
# - TARGET_BOX=LINUX64_MUSL
# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
# Build with the same flags that Alpine uses in its OpenBLAS package.
- <<: *test-alpine
env:
- TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
# - &test-cmake
# os: linux
# compiler: clang
# addons:
# apt:
# packages:
# - gfortran
# - cmake
# dist: trusty
# sudo: true
# before_script:
# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
# script:
# - mkdir build
# - CONFIG=Release
# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
# - cmake --build build --config $CONFIG -- -j2
# env:
# - CMAKE=1
# - <<: *test-cmake
# env:
# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1"
# - <<: *test-cmake
# compiler: gcc
# env:
# - CMAKE=1
- &test-cmake
os: linux
compiler: clang
addons:
apt:
packages:
- gfortran
- cmake
dist: trusty
sudo: true
before_script:
- COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
script:
- mkdir build
- CONFIG=Release
- cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
- cmake --build build --config $CONFIG -- -j2
env:
- CMAKE=1
- <<: *test-cmake
env:
- CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1"
- <<: *test-cmake
compiler: gcc
env:
- CMAKE=1
- &test-macos
os: osx
osx_image: xcode11.5
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
- <<: *test-macos
osx_image: xcode12
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
- <<: *test-macos
osx_image: xcode12
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
# - &test-macos
# os: osx
# osx_image: xcode11.5
# before_script:
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
# script:
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
# env:
# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
#
# - <<: *test-macos
# osx_image: xcode12
# before_script:
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
# - brew update
# script:
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
# env:
# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
#
# - <<: *test-macos
# osx_image: xcode12
# before_script:
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
# - brew update
# script:
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
# env:
# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
# - <<: *test-macos
# osx_image: xcode10
# env:
# - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
- <<: *test-macos
osx_image: xcode11.5
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
env:
# - <<: *test-macos
# osx_image: xcode11.5
# before_script:
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
# - brew update
# env:
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
- <<: *test-macos
osx_image: xcode11.5
env:
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
- BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
# - <<: *test-macos
# osx_image: xcode11.5
# env:
## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
- &test-graviton2
os: linux

View File

@ -132,7 +132,7 @@ endif ()
if (BUILD_BFLOAT16)
message(STATUS "Building Half Precision")
list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
# list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
endif ()
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")

View File

@ -1,4 +1,47 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.18
02-Oct-2021
general:
- when the build-time number of preconfigured threads is exceeded
at runtime (typically by an external program calling BLAS functions
from a larger number of threads in parallel), OpenBLAS will now
allocate an auxiliary control structure for up to 512 additional
threads instead of aborting
- added support for Loongson's LoongArch64 cpu architecture
- fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON
- added support for building OpenBLAS as a CMAKE subproject
- added support for building for Windows/ARM64 targets with clang
- improved support for building with the IBM xlf compiler
- imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV)
- imported Reference-LAPACK PR 597 for testsuite compatibility with
LLVM's libomp
x86_64:
- added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000)
- added optimized SBGEMM for Intel Cooper Lake
- reinstated the performance patch for AVX512 SGEMV_T with a proper fix
- added a workaround for a gcc11 tree-vectorizer bug that caused spurious
failures in the test programs for complex BLAS3 when compiling at -O3
(the default for cmake "release" builds)
- added support for runtime cpu count detection under Haiku OS
- worked around a long-standing miscompilation issue of the Haswell DGEMV_T
kernel with gcc that could produce NaN output in some corner cases
POWER:
- improved performance of DASUM on POWER10
ARMV8:
- fixed crashes (use of reserved register x18) on Apple M1 under OSX
- fixed building with gcc releases earlier than 5.1
MIPS:
- fixed building under BSD
MIPS64:
- fixed building under BSD
====================================================================
Version 0.3.17
15-Jul-2021

View File

@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild
lapack_prebuild :
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc

View File

@ -1,4 +1,15 @@
ifneq ($(C_COMPILER), PGI)
ifneq ($(GCCVERSIONGT4), 1)
CCOMMON_OPT += -march=armv8-a
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a
endif
else
ifeq ($(CORE), ARMV8)
CCOMMON_OPT += -march=armv8-a
ifneq ($(F_COMPILER), NAG)
@ -138,4 +149,7 @@ FCOMMON_OPT += -march=armv8-a -mtune=emag
endif
endif
endif
endif
endif

3
Makefile.loongarch64 Normal file
View File

@ -0,0 +1,3 @@
# Placeholder for LoongArch64-specific build settings.
# Both the BINARY64 branch and its alternative are empty here, so this
# fragment currently contributes no flags or variables of its own.
ifdef BINARY64
else
endif

View File

@ -12,9 +12,13 @@ endif
ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif
endif
endif
ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI)
@ -33,7 +37,11 @@ else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
endif
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
ifeq ($(OSNAME), AIX)
ifeq ($(F_COMPILER), IBM)
FCOMMON_OPT += -O2 -qrecur -qnosave
else
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
endif
else
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
endif

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.17
VERSION = 0.3.17.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -33,6 +33,10 @@ else ifeq ($(ARCH), armv7)
override ARCH=arm
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
else ifeq ($(ARCH), mipsel)
override ARCH=mips
else ifeq ($(ARCH), mips64el)
override ARCH=mips64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif
@ -244,6 +248,14 @@ else
ONLY_CBLAS = 0
endif
#For small matrix optimization
ifeq ($(ARCH), x86_64)
SMALL_MATRIX_OPT = 1
endif
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
endif
# This operation is expensive, so it should be executed only once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
@ -780,6 +792,11 @@ NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
ifeq ($(ARCH), loongarch64)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
#
# C Compiler dependent settings
@ -850,6 +867,13 @@ ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1
endif
ifeq ($(ARCH), loongarch64)
ifeq ($(CORE), LOONGSON3R5)
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
endif
endif
endif
ifndef BINARY_DEFINED

View File

@ -2,7 +2,7 @@
[![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS)
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
- **Cortex-A55**: same as ARMV8 (different cpu specifications)
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 (different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)
@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
#### RISC-V
- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
```sh
make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
```
(also known to work on C906)
### Support for multiple targets in a single library

View File

@ -110,3 +110,5 @@ Z14
RISCV64_GENERIC
C910V
11.LOONGARCH64:
LOONGSON3R5

View File

@ -19,7 +19,7 @@ jobs:
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
vmImage: 'ubuntu-latest'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
@ -35,7 +35,7 @@ jobs:
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
vmImage: 'ubuntu-latest'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
@ -83,6 +83,8 @@ jobs:
- script: |
brew update
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install
ls -lR ../blasinst
- job: OSX_GCC_Nothreads
pool:
@ -104,6 +106,38 @@ jobs:
brew install llvm libomp
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
- job: OSX_OpenMP_Clang_cmake
pool:
vmImage: 'macOS-10.15'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
steps:
- script: |
brew update
brew install llvm libomp
mkdir build
cd build
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 ..
make
ctest
- job: OSX_OpenMP_Clang_gf_cmake
pool:
vmImage: 'macOS-10.15'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
steps:
- script: |
brew update
brew install llvm libomp
mkdir build
cd build
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNO_AVX512=1 ..
make
ctest
- job: OSX_Ifort_Clang
pool:
vmImage: 'macOS-10.15'
@ -147,13 +181,34 @@ jobs:
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
- job: OSX_IOS_ARMV8
pool:
vmImage: 'macOS-10.15'
variables:
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0
steps:
- script: |
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: OSX_IOS_ARMV7
pool:
vmImage: 'macOS-10.15'
variables:
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1
steps:
- script: |
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: ALPINE_MUSL
pool:
vmImage: 'ubuntu-latest'
steps:
- script: |
wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \
&& echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \
|| exit 1
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
alpine make DYNAMIC_ARCH=1 BINARY=64

View File

@ -94,6 +94,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$defined = 0;
@ -143,6 +144,11 @@ if ($architecture eq "riscv64") {
$binary = 64;
}
if ($architecture eq "loongarch64") {
$defined = 1;
$binary = 64;
}
if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64");
@ -226,6 +232,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);

View File

@ -400,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
#ifdef __cplusplus
}
#endif /* __cplusplus */

View File

@ -113,6 +113,10 @@ if (MIPS64)
set(NO_BINARY_MODE 1)
endif ()
if (LOONGARCH64)
set(NO_BINARY_MODE 1)
endif ()
if (${ARCH} STREQUAL "alpha")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)

View File

@ -29,6 +29,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64")
endif ()
if (LOONGARCH64)
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
endif ()
set(BINARY_DEFINED 1)
endif ()
if (CMAKE_SYSTEM_NAME STREQUAL "AIX")
set(BINARY_DEFINED 1)
endif ()
@ -124,9 +133,9 @@ if (NOT DYNAMIC_ARCH)
if (HAVE_AVX)
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
endif ()
if (HAVE_FMA3)
set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
endif ()
# if (HAVE_FMA3)
#set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
#endif ()
if (HAVE_SSE)
set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
endif ()

View File

@ -61,6 +61,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
endif ()
endif ()
if (LOONGARCH64)
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
endif ()
endif ()
else ()
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
@ -97,7 +104,7 @@ endif ()
if (${F_COMPILER} STREQUAL "IBM")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM")
# FCOMMON_OPT += -qarch=440
set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur")
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -q64")
if (INTERFACE64)

View File

@ -134,6 +134,8 @@ if (BUILD_BFLOAT16)
set(SHSWAPKERNEL ../arm/swap.c)
set(TOBF16KERNEL ../x86_64/tobf16.c)
set(BF16TOKERNEL ../x86_64/bf16to.c)
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
endif ()
endmacro ()

View File

@ -186,11 +186,11 @@ if (DEFINED TARGET)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
endif()
if (DEFINED HAVE_FMA3)
if (NOT NO_AVX2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
endif()
endif()
# if (DEFINED HAVE_FMA3)
# if (NOT NO_AVX2)
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
# endif()
# endif()
if (DEFINED HAVE_SSE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
endif()
@ -258,6 +258,13 @@ if (NEED_PIC)
endif()
endif ()
if (X86_64)
set(SMALL_MATRIX_OPT TRUE)
endif ()
if (SMALL_MATRIX_OPT)
set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
endif ()
if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
@ -462,6 +469,9 @@ endif()
if (BUILD_COMPLEX16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
endif()
if (BUILD_BFLOAT16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16")
endif()
if(NOT MSVC)
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
endif()

View File

@ -38,6 +38,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
set(PPC 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
set(LOONGARCH64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
@ -95,7 +97,7 @@ else()
endif ()
if (NOT BINARY)
if (X86_64 OR ARM64 OR PPC OR MIPS64)
if (X86_64 OR ARM64 OR PPC OR MIPS64 OR LOONGARCH64)
set(BINARY 64)
else ()
set(BINARY 32)

View File

@ -157,31 +157,31 @@ endfunction ()
# STRING - compiles only the given type (e.g. DOUBLE)
function(GenerateNamedObjects sources_in)
if (DEFINED ARGV1)
if (${ARGC} GREATER 1)
set(defines_in ${ARGV1})
endif ()
if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "")
if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "")
set(name_in ${ARGV2})
# strip off extension for kernel files that pass in the object name.
get_filename_component(name_in ${name_in} NAME_WE)
endif ()
if (DEFINED ARGV3)
if (${ARGC} GREATER 3)
set(use_cblas ${ARGV3})
else ()
set(use_cblas false)
endif ()
if (DEFINED ARGV4)
if (${ARGC} GREATER 4)
set(replace_last_with ${ARGV4})
endif ()
if (DEFINED ARGV5)
if (${ARGC} GREATER 5)
set(append_with ${ARGV5})
endif ()
if (DEFINED ARGV6)
if (${ARGC} GREATER 6)
set(no_float_type ${ARGV6})
else ()
set(no_float_type false)
@ -196,7 +196,7 @@ function(GenerateNamedObjects sources_in)
set(real_only false)
set(complex_only false)
set(mangle_complex_sources false)
if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "")
if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "")
if (${ARGV7} EQUAL 1)
set(real_only true)
elseif (${ARGV7} EQUAL 2)
@ -311,7 +311,15 @@ function(GenerateNamedObjects sources_in)
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
file(REMOVE ${new_source_file}.tmp)
list(APPEND SRC_LIST_OUT ${new_source_file})
message (STATUS ${new_source_file})
if (DEFINED HAVE_FMA3)
if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c")
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
endif ()
if ( ${new_source_file} MATCHES "dgemv_t_k.*c")
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
endif ()
endif ()
endforeach ()
endforeach ()
@ -334,17 +342,17 @@ endfunction ()
function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme)
set(alternate_name_in "")
if (DEFINED ARGV5)
if (${ARGC} GREATER 5)
set(alternate_name_in ${ARGV5})
endif ()
set(no_float_type false)
if (DEFINED ARGV6)
if (${ARGC} GREATER 6)
set(no_float_type ${ARGV6})
endif ()
set(complex_filename_scheme "")
if (DEFINED ARGV7)
if (${ARGC} GREATER 7)
set(complex_filename_scheme ${ARGV7})
endif ()

View File

@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_zarch.h"
#endif
#ifdef ARCH_LOONGARCH64
#include "common_loongarch64.h"
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];

View File

@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
.text ;
.p2align 2 ;
.global REALNAME ;
#ifndef __APPLE__
#if !defined(__APPLE__) && !defined(_WIN32)
.type REALNAME, %function ;
#endif
REALNAME:

View File

@ -232,6 +232,8 @@
#define CGEADD_K cgeadd_k
#define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit
#else
#define CAMAX_K gotoblas -> camax_k
@ -426,8 +428,51 @@
#define CGEADD_K gotoblas -> cgeadd_k
#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit
#endif
#define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn)
#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt)
#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr)
#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc)
#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn)
#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt)
#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr)
#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc)
#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn)
#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt)
#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr)
#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc)
#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn)
#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct)
#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr)
#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc)
#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn)
#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt)
#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr)
#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc)
#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn)
#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt)
#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr)
#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc)
#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn)
#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt)
#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr)
#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc)
#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn)
#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct)
#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr)
#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc)
#define CGEMM_NN cgemm_nn
#define CGEMM_CN cgemm_cn
#define CGEMM_TN cgemm_tn

View File

@ -157,6 +157,8 @@
#define DIMATCOPY_K_RT dimatcopy_k_rt
#define DGEADD_K dgeadd_k
#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit
#else
#define DAMAX_K gotoblas -> damax_k
@ -281,8 +283,21 @@
#define DGEADD_K gotoblas -> dgeadd_k
#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit
#endif
#define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn)
#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt)
#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn)
#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt)
#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn)
#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt)
#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn)
#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt)
#define DGEMM_NN dgemm_nn
#define DGEMM_CN dgemm_tn
#define DGEMM_TN dgemm_tn

View File

@ -515,6 +515,129 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble
int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
#endif
#ifdef SMALL_MATRIX_OPT
/*
 * Prototypes for the small-matrix GEMM entry points. They are declared only
 * when SMALL_MATRIX_OPT is defined.
 *
 * For every prefix (sbgemm, sgemm, dgemm, cgemm, zgemm) this section declares:
 *   - <prefix>_small_matrix_permit: takes transa/transb, m/n/k and the
 *     alpha/beta scalars and returns an int;
 *   - <prefix>_small_kernel_XY: takes A/lda, B/ldb, C/ldc plus alpha and beta;
 *   - <prefix>_small_kernel_b0_XY: same argument list without any beta
 *     parameter.
 * The real prefixes declare XY in {nn, nt, tn, tt}; the complex prefixes
 * declare all 16 combinations of {n, t, r, c}, and pass alpha/beta as two
 * scalars each (alpha0/alpha1, beta0/beta1).
 *
 * NOTE(review): XY presumably encodes the transpose/conjugate mode of A and B,
 * and "b0" presumably means beta == 0 -- confirm against the kernel sources.
 */

/* Real types: permit check and kernels that take beta. */
int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
/* Real types: b0 kernels (no beta parameter). */
int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
/* Complex types: permit check and kernels that take beta0/beta1. */
int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1);
int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1);
int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
/* Complex types: b0 kernels (no beta0/beta1 parameters). */
int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
#endif
int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);

199
common_loongarch64.h Normal file
View File

@ -0,0 +1,199 @@
/*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/*
 * LoongArch64 definitions shared by C and assembler translation units.
 *
 * The header has two halves selected by ASSEMBLER:
 *   - without ASSEMBLER: C helpers (blas_quickdivide, GET_IMAGE);
 *   - with ASSEMBLER: precision-dependent instruction aliases plus the
 *     PROLOGUE/EPILOGUE scaffolding used around each assembly routine.
 * Memory-layout constants at the bottom are visible to both.
 */
#ifndef COMMON_LOONGARCH64
#define COMMON_LOONGARCH64

/* MB, WMB and RMB all expand to the same builtin barrier. */
#define MB  __sync_synchronize()
#define WMB __sync_synchronize()
#define RMB __sync_synchronize()

#define INLINE inline

#ifndef ASSEMBLER

/* Plain integer division of x by y; no special handling of y == 0. */
static inline int blas_quickdivide(blasint x, blasint y){
  return x / y;
}

/*
 * GET_IMAGE(res) copies floating-point register $f2 into res, using the
 * double or single move depending on DOUBLE.
 * NOTE(review): presumably used to fetch the imaginary part of a complex
 * return value -- confirm that $f2 is the right register for this ABI.
 */
#ifdef DOUBLE
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
#endif

/* Expands to nothing on this architecture. */
#define GET_IMAGE_CANCEL

#else

/*
 * Precision-neutral mnemonics: each alias resolves to the .d instruction
 * when DOUBLE is defined and to the .s instruction otherwise. CMOVT maps to
 * fsel in both cases; MTC maps to movgr2fr.d or movgr2fr.w.
 */
#ifdef DOUBLE
#define LD    fld.d
#define ST    fst.d
#define MADD  fmadd.d
#define NMADD fnmadd.d
#define MSUB  fmsub.d
#define NMSUB fnmsub.d
#define ADD   fadd.d
#define SUB   fsub.d
#define MUL   fmul.d
#define MOV   fmov.d
#define CMOVT fsel
#define MTC   movgr2fr.d
#define FABS  fabs.d
#define CMPEQ fcmp.ceq.d
#define CMPLE fcmp.cle.d
#define CMPLT fcmp.clt.d
#define NEG   fneg.d
#else
#define LD    fld.s
#define ST    fst.s
#define MADD  fmadd.s
#define NMADD fnmadd.s
#define MSUB  fmsub.s
#define NMSUB fnmsub.s
#define ADD   fadd.s
#define SUB   fsub.s
#define MUL   fmul.s
#define MOV   fmov.s
#define CMOVT fsel
#define MTC   movgr2fr.w
#define FABS  fabs.s
#define CMPEQ fcmp.ceq.s
#define CMPLE fcmp.cle.s
#define CMPLT fcmp.clt.s
#define NEG   fneg.s
#endif /* defined(DOUBLE) */

/*
 * Integer load/store widths:
 *   LDINT is ld.d only when both __64BIT__ and USE64BITINT are defined,
 *   otherwise ld.w; LDARG/SDARG are ld.d/st.d whenever __64BIT__ is defined
 *   and ld.w/st.w otherwise.
 */
#if defined(__64BIT__) && defined(USE64BITINT)
#define LDINT ld.d
#define LDARG ld.d
#define SDARG st.d
#elif defined(__64BIT__) && !defined(USE64BITINT)
#define LDINT ld.w
#define LDARG ld.d
#define SDARG st.d
#else
#define LDINT ld.w
#define LDARG ld.w
#define SDARG st.w
#endif

/* Symbol name of the routine being assembled: ASMFNAME when F_INTERFACE is
 * defined, ASMNAME otherwise (both are defined outside this header). */
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif /* defined(F_INTERFACE) */

#if defined(ASSEMBLER) && !defined(NEEDPARAM)

/*
 * PROLOGUE opens a routine: switch to .text, align, export REALNAME, mark it
 * as a function and place its label.
 * The final line deliberately carries NO trailing backslash: a continuation
 * there would splice the following #if directive into the macro body and
 * unbalance every conditional after it.
 */
#define PROLOGUE \
    .text ;\
    .align 5 ;\
    .globl REALNAME ;\
    .type REALNAME, @function ;\
REALNAME: ;

/* GNUSTACK emits the .note.GNU-stack section on Linux/ELF and is empty
 * elsewhere. */
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",@progbits
#else
#define GNUSTACK
#endif /* defined(__linux__) && defined(__ELF__) */

/* EPILOGUE closes a routine with .end REALNAME followed by GNUSTACK. */
#define EPILOGUE \
    .end REALNAME ;\
    GNUSTACK

/* Expands to nothing on this architecture. */
#define PROFCODE

/*
 * MOVT(dst, src, cc): bceqz skips to local label 1 when condition flag cc is
 * zero; otherwise add.d writes src + $r0 into dst.
 * NOTE(review): relies on $r0 reading as zero so the add acts as a move.
 */
#define MOVT(dst, src, cc) \
    bceqz cc, 1f; \
    add.d dst, src, $r0; \
    1:

#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */

#endif /* defined(ASSEMBLER) */

/* Memory-layout constants. START_ADDRESS and MAX_CPU_NUMBER, used by
 * BASE_ADDRESS, are defined outside this header. */
#define SEEK_ADDRESS

#define BUFFER_SIZE    ( 32 << 20)
#define PAGESIZE       (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#define HUGE_PAGESIZE  ( 2 << 20)

#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

/* Fall back to MAP_ANON when the platform headers lack MAP_ANONYMOUS. */
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

#endif

View File

@ -644,6 +644,17 @@
#define GEADD_K DGEADD_K
#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT
#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT
#elif defined(BFLOAT16)
#define D_TO_BF16_K SBDTOBF16_K
@ -931,6 +942,18 @@
#define GEADD_K SGEADD_K
#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT
#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT
#endif
#else
@ -1236,6 +1259,19 @@
#define IMATCOPY_K_RT SIMATCOPY_K_RT
#define GEADD_K SGEADD_K
#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT
#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT
#endif
#else
#ifdef XDOUBLE
@ -2063,6 +2099,48 @@
#define GEADD_K ZGEADD_K
#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT
#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR
#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC
#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR
#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC
#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN
#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT
#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR
#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC
#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN
#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT
#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR
#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC
#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR
#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC
#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT
#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR
#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC
#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN
#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT
#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR
#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC
#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN
#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT
#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR
#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC
#else
#define AMAX_K CAMAX_K
@ -2486,11 +2564,54 @@
#define GEADD_K CGEADD_K
#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT
#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR
#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC
#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR
#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC
#define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN
#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT
#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR
#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC
#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN
#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT
#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR
#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC
#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR
#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC
#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT
#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR
#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC
#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN
#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT
#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR
#define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC
#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN
#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT
#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR
#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC
#endif
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|| defined(ARCH_LOONGARCH64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;

View File

@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
#ifdef SMALL_MATRIX_OPT
/*
 * Function-pointer members for the small-matrix SBGEMM entry points; present
 * only when SMALL_MATRIX_OPT is defined. There is one permit pointer, four
 * kernel pointers (nn, nt, tn, tt) whose signatures include beta, and four
 * b0 kernel pointers whose signatures have no beta parameter.
 * A and B are bfloat16 pointers, C is a float pointer.
 * NOTE(review): "b0" presumably denotes the beta == 0 case -- confirm.
 */
int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
#endif
#endif
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
@ -207,6 +220,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
#endif
#ifdef BUILD_SINGLE
#ifdef SMALL_MATRIX_OPT
int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
#endif
int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
@ -314,6 +341,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
#endif
#ifdef BUILD_DOUBLE
#ifdef SMALL_MATRIX_OPT
int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
#endif
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
@ -513,6 +553,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
#ifdef SMALL_MATRIX_OPT
int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1);
int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
#endif
int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
@ -679,6 +763,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
#ifdef SMALL_MATRIX_OPT
int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1);
int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
#endif
int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
@ -1069,6 +1197,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
extern gotoblas_t *gotoblas;
#define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func)
#define DTB_ENTRIES gotoblas -> dtb_entries
#define GEMM_OFFSET_A gotoblas -> offsetA
#define GEMM_OFFSET_B gotoblas -> offsetB
@ -1174,6 +1304,8 @@ extern gotoblas_t *gotoblas;
#else
#define FUNC_OFFSET(func) (size_t)(func)
#define DTB_ENTRIES DTB_DEFAULT_ENTRIES
#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A

View File

@ -164,6 +164,8 @@
#define SGEADD_K sgeadd_k
#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit
#else
#define SAMAX_K gotoblas -> samax_k
@ -299,8 +301,21 @@
#define SGEADD_K gotoblas -> sgeadd_k
#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit
#endif
#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn)
#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt)
#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn)
#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt)
#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn)
#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt)
#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn)
#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt)
#define SGEMM_NN sgemm_nn
#define SGEMM_CN sgemm_tn
#define SGEMM_TN sgemm_tn

View File

@ -24,6 +24,7 @@
#define SBGEMM_BETA sbgemm_beta
#define SBGEMM_KERNEL sbgemm_kernel
#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit
#else
#define SBDOT_K gotoblas -> sbdot_k
@ -41,8 +42,19 @@
#define SBGEMM_BETA gotoblas -> sbgemm_beta
#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel
#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit
#endif
#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn)
#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt)
#define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn)
#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt)
#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn)
#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt)
#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn)
#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt)
#define SBGEMM_NN sbgemm_nn
#define SBGEMM_CN sbgemm_tn
#define SBGEMM_TN sbgemm_tn

View File

@ -232,6 +232,8 @@
#define ZGEADD_K zgeadd_k
#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit
#else
#define ZAMAX_K gotoblas -> zamax_k
@ -426,8 +428,51 @@
#define ZGEADD_K gotoblas -> zgeadd_k
#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit
#endif
#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn)
#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt)
#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr)
#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc)
#define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn)
#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt)
#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr)
#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc)
#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn)
#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt)
#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr)
#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc)
#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn)
#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct)
#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr)
#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc)
#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn)
#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt)
#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr)
#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc)
#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn)
#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt)
#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr)
#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc)
#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn)
#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt)
#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr)
#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc)
#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn)
#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct)
#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr)
#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc)
#define ZGEMM_NN zgemm_nn
#define ZGEMM_CN zgemm_cn
#define ZGEMM_TN zgemm_tn

110
cpuid_loongarch64.c Normal file
View File

@ -0,0 +1,110 @@
/*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdint.h>
/* Identifiers returned by detect(); each value is also the index of the
   matching entry in cpuname[]. */
#define CPU_UNKNOWN     0
#define CPU_LOONGSON3R5 1

/* Operand handed to the cpucfg instruction in detect(). */
#define LOONGARCH_CFG2  0x02
/* Bit mask tested against the cpucfg result in detect(). The expansion is
   parenthesized so the shift stays a single operand wherever the macro is
   used (e.g. next to *, + or ==). */
#define LOONGARCH_LASX  (1<<7)

/* Core names, indexed by the CPU_* identifiers above. */
static char *cpuname[] = {
  "UNKNOWN",
  "LOONGSON3R5"
};
/*
 * Identify the CPU this program is running on.
 *
 * Executes the cpucfg instruction with LOONGARCH_CFG2 as its source operand
 * and tests the LOONGARCH_LASX bit of the value it produces.
 *
 * Returns CPU_LOONGSON3R5 when that bit is set, CPU_UNKNOWN otherwise.
 */
int detect(void) {
  uint32_t cfg = 0;

  /* The asm statement (template, constraints, operand order) is unchanged. */
  __asm__ volatile (
    "cpucfg %0, %1 \n\t"
    : "+&r"(cfg)
    : "r"(LOONGARCH_CFG2)
  );

  return (cfg & LOONGARCH_LASX) ? CPU_LOONGSON3R5 : CPU_UNKNOWN;
}
/*
 * Return the name of the detected core.
 *
 * The result points into the static cpuname[] table, selected by the
 * identifier that detect() returns; the caller must not free it.
 */
char *get_corename(void) {
  int core = detect();

  return cpuname[core];
}
/* Print the architecture name to stdout, without a trailing newline. */
void get_architecture(void)
{
  printf("LOONGARCH64");
}
/*
 * Print the sub-architecture name to stdout, without a trailing newline:
 * "LOONGSON3R5" when detect() reports that core, "UNKNOWN" for anything else.
 */
void get_subarchitecture(void) {
  switch (detect()) {
  case CPU_LOONGSON3R5:
    printf("LOONGSON3R5");
    break;
  default:
    printf("UNKNOWN");
    break;
  }
}
/* Print the sub-directory name to stdout, without a trailing newline. */
void get_subdirname(void)
{
  printf("loongarch64");
}
/*
 * Print the CPU configuration as C preprocessor definitions on stdout,
 * one "#define" per line.
 *
 * The previous version branched on detect() but printed exactly the same
 * eight lines in both branches, so the test had no effect on the output.
 * The duplicated branch is removed; the LOONGSON3R5 definitions are still
 * emitted for every CPU, including one that detect() reports as
 * CPU_UNKNOWN.
 */
void get_cpuconfig(void) {
  printf("#define LOONGSON3R5\n");
  printf("#define L1_DATA_SIZE 65536\n");
  printf("#define L1_DATA_LINESIZE 64\n");
  printf("#define L2_SIZE 1048576\n");
  printf("#define L2_LINESIZE 64\n");
  printf("#define DTB_DEFAULT_ENTRIES 64\n");
  printf("#define DTB_SIZE 4096\n");
  printf("#define L2_ASSOCIATIVE 16\n");
}
/*
 * Print the library name suffix to stdout, followed by a newline:
 * "loongson3r5" when detect() reports that core, "loongarch64" otherwise.
 */
void get_libname(void){
  if (detect() != CPU_LOONGSON3R5) {
    printf("loongarch64\n");
    return;
  }

  printf("loongson3r5\n");
}

View File

@ -84,7 +84,7 @@ OS_AIX
OS_OSF
#endif
#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT)
#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT)
OS_WINNT
#endif
@ -141,7 +141,7 @@ ARCH_SPARC
ARCH_IA64
#endif
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__)
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__)
BINARY_64
#endif
@ -157,6 +157,10 @@ ARCH_ARM64
ARCH_RISCV64
#endif
#ifdef __loongarch64
ARCH_LOONGARCH64
#endif
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
HAVE_C11
#endif

View File

@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR})
enable_language(Fortran)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
# When the Fortran compiler is GNU, append -fno-tree-vectorize to the
# Fortran compile flags.
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
endif()
if(WIN32)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1

View File

@ -6,6 +6,9 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS
# gfortran only: append -fno-tree-vectorize to the Fortran flags.
ifeq ($(F_COMPILER),GFORTRAN)
override FFLAGS += -fno-tree-vectorize
endif
override TARGET_ARCH=
override TARGET_MACH=

View File

@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES})
GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type})
endif ()
# special defines for complex
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
foreach (u_source ${U_SOURCES})
@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES})
endif ()
endforeach ()
# With both BUILD_BFLOAT16 and USE_THREAD set, generate the two threaded gemv
# objects for the BFLOAT16 type from sbgemv_thread.c: gemv_thread_n (no extra
# define) and gemv_thread_t (built with TRANSA).
if (BUILD_BFLOAT16)
if (USE_THREAD)
GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16")
GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16")
endif ()
endif ()
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)
if (USE_THREAD)
GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE")

View File

@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES})
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0)
endif ()
# With BUILD_BFLOAT16 set, also generate the gemm object for the current
# GEMM_DEFINE with the BFLOAT16 type, plus its THREADED_LEVEL3 variant when
# USE_THREAD is on and USE_SIMPLE_THREADED_LEVEL3 is off.
if (BUILD_BFLOAT16)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
endif ()
endif ()
endforeach ()
if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)

View File

@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8;
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
extern gotoblas_t gotoblas_POWER9;
#endif
//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
//#define HAVE_P10_SUPPORT 1
//#endif
#ifdef HAVE_P10_SUPPORT
extern gotoblas_t gotoblas_POWER10;
#endif

View File

@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/* Branch-prediction hint macros; skipped entirely when likely() is already
   defined. */
#ifndef likely
#ifdef __GNUC__
/* !!(x) reduces any nonzero condition to 1 before it is handed to
   __builtin_expect together with the expected value (1 or 0). */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
/* Without __GNUC__ both macros expand to the bare condition. */
#define likely(x) (x)
#define unlikely(x) (x)
#endif
#endif
#if defined(USE_TLS) && defined(SMP)
#define COMPILE_TLS
@ -428,7 +438,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
int max_num;
#endif
int blas_goto_num = 0;
@ -436,7 +446,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
max_num = get_num_procs();
#endif
@ -460,7 +470,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@ -1291,7 +1301,12 @@ UNLOCK_COMMAND(&alloc_lock);
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
error:
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
printf("cpu cores than what OpenBLAS was configured to handle.\n");
return NULL;
}
@ -1979,7 +1994,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
int max_num;
#endif
int blas_goto_num = 0;
@ -1987,7 +2002,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
max_num = get_num_procs();
#endif
@ -2011,7 +2026,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@ -2055,6 +2070,7 @@ struct release_t {
int hugetlb_allocated = 0;
static struct release_t release_info[NUM_BUFFERS];
static struct release_t *new_release_info;
static int release_pos = 0;
#if defined(OS_LINUX) && !defined(NO_WARMUP)
@ -2105,8 +2121,13 @@ static void *alloc_mmap(void *address){
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
}
release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
@ -2269,8 +2290,13 @@ static void *alloc_mmap(void *address){
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
}
release_pos ++;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
@ -2302,8 +2328,13 @@ static void *alloc_malloc(void *address){
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_malloc_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free;
}
release_pos ++;
}
@ -2336,8 +2367,13 @@ static void *alloc_qalloc(void *address){
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_qalloc_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free;
}
release_pos ++;
}
@ -2365,8 +2401,13 @@ static void *alloc_windows(void *address){
if (map_address == (void *)NULL) map_address = (void *)-1;
if (map_address != (void *)-1) {
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_windows_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free;
}
release_pos ++;
}
@ -2409,9 +2450,15 @@ static void *alloc_devicedirver(void *address){
fd, 0);
if (map_address != (void *)-1) {
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
release_info[release_pos].func = alloc_devicedirver_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].attr = fd;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free;
}
release_pos ++;
}
@ -2445,9 +2492,15 @@ static void *alloc_shm(void *address){
shmctl(shmid, IPC_RMID, 0);
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = shmid;
release_info[release_pos].func = alloc_shm_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].attr = shmid;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free;
}
release_pos ++;
}
@ -2551,8 +2604,13 @@ static void *alloc_hugetlb(void *address){
#endif
if (map_address != (void *)-1){
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_hugetlb_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free;
}
release_pos ++;
}
@ -2599,9 +2657,15 @@ static void *alloc_hugetlbfile(void *address){
fd, 0);
if (map_address != (void *)-1) {
if (likely(release_pos < NUM_BUFFERS)) {
release_info[release_pos].address = map_address;
release_info[release_pos].attr = fd;
release_info[release_pos].func = alloc_hugetlbfile_free;
} else {
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
new_release_info[release_pos-NUM_BUFFERS].attr = fd;
new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free;
}
release_pos ++;
}
@ -2631,8 +2695,25 @@ static volatile struct {
} memory[NUM_BUFFERS];
static int memory_initialized = 0;
/* One slot of the auxiliary buffer table (`newmemory`) that blas_memory_alloc()
 * allocates on the heap once every entry of the static `memory[NUM_BUFFERS]`
 * table is in use. */
struct newmemstruct
{
/* Per-slot lock word; handed to blas_lock()/blas_unlock() in USE_OPENMP builds. */
BLASULONG lock;
/* Address of the buffer mapped for this slot; (void *)0 until one is assigned. */
void *addr;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
/* Initialised to -1 and set to the caller's `mypos` the first time the slot is handed out. */
int pos;
#endif
/* Non-zero while the slot is claimed; cleared again by blas_memory_free(). */
int used;
/* NOTE(review): presumably padding that keeps each slot at a fixed size
 * (the 32-bit and 64-bit variants differ by 8 bytes) - confirm against the
 * layout of the static `memory` table. */
#ifndef __64BIT__
char dummy[48];
#else
char dummy[40];
#endif
};
static volatile struct newmemstruct *newmemory;
static int memory_initialized = 0;
static int memory_overflowed = 0;
/* Memory allocation routine */
/* procpos ... indicates where it comes from */
/* 0 : Level 3 functions */
@ -2641,6 +2722,8 @@ static int memory_initialized = 0;
void *blas_memory_alloc(int procpos){
int i;
int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos = 0;
@ -2774,6 +2857,29 @@ void *blas_memory_alloc(int procpos){
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
if (memory_overflowed) {
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
RMB;
#if defined(USE_OPENMP)
if (!newmemory[position-NUM_BUFFERS].used) {
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
#endif
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
#if defined(USE_OPENMP)
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
}
#endif
position ++;
} while (position < 512+NUM_BUFFERS);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
goto error;
allocation :
@ -2878,8 +2984,97 @@ void *blas_memory_alloc(int procpos){
return (void *)memory[position].addr;
error:
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
/* First overflow of the static tables: create the 512-entry auxiliary arrays
 * for buffer slots (newmemory) and release records (new_release_info).
 * If the auxiliary arrays already exist and are full as well, give up. */
if (memory_overflowed) goto terminate;
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
/* malloc() may fail; without this check the initialisation loop below would
 * dereference a NULL pointer.  Release whichever array was obtained and take
 * the regular termination path instead. */
if (new_release_info == NULL || newmemory == NULL) {
  fprintf(stderr,"OpenBLAS error: could not allocate the auxiliary array for thread metadata.\n");
  free(new_release_info);
  free((void *)newmemory);
  new_release_info = NULL;
  newmemory = NULL;
  goto terminate;
}
for (i = 0; i < 512; i++) {
  newmemory[i].addr = (void *)0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
  newmemory[i].pos = -1;
#endif
  newmemory[i].used = 0;
  newmemory[i].lock = 0;
}
/* Raise the flag only once both arrays exist and are initialised, so that a
 * failed allocation never leaves memory_overflowed set with NULL tables. */
memory_overflowed=1;
newmemory[position-NUM_BUFFERS].used = 1;
allocation2:
newmemory[position-NUM_BUFFERS].used = 1;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#else
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
#endif
do {
#ifdef DEBUG
printf("Allocation Start : %lx\n", base_address);
#endif
map_address = (void *)-1;
func = &memoryalloc[0];
while ((func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
#ifdef ALLOC_DEVICEDRIVER
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
}
#endif
#ifdef ALLOC_HUGETLBFILE
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
#ifndef OS_WINDOWS
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
#endif
}
#endif
#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
#endif
func ++;
}
#ifdef DEBUG
printf(" Success -> %08lx\n", map_address);
#endif
if (((BLASLONG) map_address) == -1) base_address = 0UL;
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
} while ((BLASLONG)map_address == -1);
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
newmemory[position-NUM_BUFFERS].addr = map_address;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position);
#endif
#if defined(WHEREAMI) && !defined(USE_OPENMP)
if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos;
#endif
return (void *)newmemory[position-NUM_BUFFERS].addr;
terminate:
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
printf("cpu cores than what OpenBLAS was configured to handle.\n");
return NULL;
}
@ -2898,13 +3093,28 @@ void blas_memory_free(void *free_area){
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
if (position >= NUM_BUFFERS) goto error;
if (position >= NUM_BUFFERS && !memory_overflowed) goto error;
#ifdef DEBUG
if (memory[position].addr != free_area) goto error;
printf(" Position : %d\n", position);
#endif
if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) {
/* The buffer was not found in the static table: look it up in the 512-entry
 * auxiliary table.  `position` keeps counting from NUM_BUFFERS, so every
 * access to newmemory[] must subtract that offset. */
while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area))
position++;
/* Not present in the auxiliary table either: report it as a bad free rather
 * than writing past the end of newmemory[]. */
if (position >= NUM_BUFFERS+512) goto error;
// arm: ensure all writes are finished before other thread takes this memory
WMB;
/* Fixed: the slot index previously omitted the NUM_BUFFERS offset, so the
 * wrong entry (or memory beyond the array) was cleared and the real slot
 * stayed marked as in use. */
newmemory[position-NUM_BUFFERS].used = 0;
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf("Unmap from overflow area succeeded.\n\n");
#endif
return;
} else {
// arm: ensure all writes are finished before other thread takes this memory
WMB;
@ -2918,7 +3128,7 @@ void blas_memory_free(void *free_area){
#endif
return;
}
error:
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
@ -2953,7 +3163,10 @@ void blas_shutdown(void){
LOCK_COMMAND(&alloc_lock);
for (pos = 0; pos < release_pos; pos ++) {
if (likely(pos < NUM_BUFFERS))
release_info[pos].func(&release_info[pos]);
else
new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]);
}
#ifdef SEEK_ADDRESS
@ -2970,6 +3183,15 @@ void blas_shutdown(void){
#endif
memory[pos].lock = 0;
}
if (memory_overflowed)
for (pos = 0; pos < 512; pos ++){
newmemory[pos].addr = (void *)0;
newmemory[pos].used = 0;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
newmemory[pos].pos = -1;
#endif
newmemory[pos].lock = 0;
}
UNLOCK_COMMAND(&alloc_lock);

View File

@ -524,6 +524,9 @@ void blas_set_parameter(void){
xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
#endif
#ifdef BUILD_BFLOAT16
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
#endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
@ -629,7 +632,9 @@ void blas_set_parameter(void){
xgemm_p = 16 * (size + 1);
#endif
#ifdef BUILD_BFLOAT16
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
#endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;

111
getarch.c
View File

@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R5 */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
@ -312,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -321,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#endif
#ifdef FORCE_HASWELL
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -335,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
@ -349,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX512
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -362,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SKYLAKEX"
#define ARCHCONFIG "-DSKYLAKEX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -379,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef FORCE_COOPERLAKE
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX512
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -392,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#endif
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "COOPERLAKE"
#define ARCHCONFIG "-DCOOPERLAKE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -563,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#ifdef NO_AVX2
#ifdef NO_AVX
#define SUBARCHITECTURE "NEHALEM"
#define ARCHCONFIG "-DNEHALEM " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2"
#define LIBNAME "nehalem"
#define CORENAME "NEHALEM"
#else
#define SUBARCHITECTURE "SANDYBRIDGE"
#define ARCHCONFIG "-DSANDYBRIDGE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
@ -571,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX"
#define LIBNAME "sandybridge"
#define CORENAME "SANDYBRIDGE"
#endif
#else
#define SUBARCHITECTURE "ZEN"
#define ARCHCONFIG "-DZEN " \
@ -842,6 +914,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
/* Forced target: Loongson 3R5 (sources under the "loongarch64" subdirectory).
 * Selecting FORCE_LOONGSON3R5 fixes the architecture and core names and the
 * cache/TLB parameters emitted into the build configuration. */
#if defined(FORCE_LOONGSON3R5)
#define FORCE
#define ARCHITECTURE    "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON3R5"
#define SUBDIRNAME      "loongarch64"
#define LIBNAME         "loongson3r5"
#define CORENAME        "LOONGSON3R5"
#define ARCHCONFIG      "-DLOONGSON3R5 " \
                        "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
                        "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
                        "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
#endif
#ifdef FORCE_I6400
#define FORCE
#define ARCHITECTURE "MIPS"
@ -1388,6 +1474,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
#ifdef __loongarch64
#include "cpuid_loongarch64.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef __riscv
#include "cpuid_riscv64.c"
#define OPENBLAS_SUPPORTED
@ -1463,7 +1554,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("CORE=%s\n", get_corename());
#endif
#endif
@ -1611,7 +1702,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif

View File

@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS})
GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX})
GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX})
GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true)
#sdsdot, dsdot
if (BUILD_SINGLE OR BUILD_DOUBLE)
GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE")
@ -104,6 +105,15 @@ endif ()
GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG})
GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG})
if (BUILD_BFLOAT16)
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16")
endif ()
# complex-specific sources
foreach (float_type ${FLOAT_TYPES})

View File

@ -105,6 +105,55 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
#endif
};
/* Small-matrix fast path: enabled only when SMALL_MATRIX_OPT is defined and
 * this translation unit is neither a GEMM3M nor an XDOUBLE build. */
#if defined(SMALL_MATRIX_OPT) && !defined(GEMM3M) && !defined(XDOUBLE)
#define USE_SMALL_MATRIX_OPT 1
#else
#define USE_SMALL_MATRIX_OPT 0
#endif
#if USE_SMALL_MATRIX_OPT
/* SMALL_KERNEL_ADDR(table, idx) yields the kernel address for table[idx].
 * Without DYNAMIC_ARCH the table entry itself is cast to a pointer; with
 * DYNAMIC_ARCH the entry is used as a byte offset from `gotoblas`, and the
 * pointer-sized value stored at that location is returned.
 * NOTE(review): presumably the GEMM_SMALL_KERNEL_* names expand to function
 * symbols in static builds and to member offsets in dynamic builds - confirm
 * in the header that defines them. */
#ifndef DYNAMIC_ARCH
#define SMALL_KERNEL_ADDR(table, idx) ((void *)(table[idx]))
#else
#define SMALL_KERNEL_ADDR(table, idx) ((void *)(*(uintptr_t *)((char *)gotoblas + (size_t)(table[idx]))))
#endif
#ifndef COMPLEX
/* Real kernels.  The tables are indexed with (transb << 2) | transa, so each
 * row of four belongs to one transb value and the column is transa; in the
 * XY suffix, X is the operation on A and Y the operation on B.  Columns 2 and
 * 3 hold 0 in the real tables. */
static size_t gemm_small_kernel[] = {
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, 0, 0,
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, 0, 0,
};
/* Same layout; these kernels take no beta argument and are the ones called
 * when beta == 0. */
static size_t gemm_small_kernel_b0[] = {
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, 0, 0,
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, 0, 0,
};
/* Typed accessors: cast the looked-up address to the kernel's function-pointer
 * type.  The expansion is an unparenthesised cast expression, so call sites
 * wrap the macro in parentheses before applying the argument list. */
#define GEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel_b0, (idx))
#define GEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, IFLOAT *, BLASLONG, FLOAT, IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(gemm_small_kernel, (idx))
#else
/* Complex kernels: full 4x4 tables, again indexed with (transb << 2) | transa,
 * covering the N, T, R and C variants of both operands. */
static size_t zgemm_small_kernel[] = {
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN,
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT,
GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR,
GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC,
};
/* Same layout; kernels without beta arguments, called when both components of
 * beta are zero. */
static size_t zgemm_small_kernel_b0[] = {
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN,
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT,
GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR,
GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC,
};
/* Typed accessors for the complex kernels; alpha (and beta, where present) are
 * passed as two separate FLOAT arguments. */
#define ZGEMM_SMALL_KERNEL(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel, (idx))
#define ZGEMM_SMALL_KERNEL_B0(idx) (int (*)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG)) SMALL_KERNEL_ADDR(zgemm_small_kernel_b0, (idx))
#endif
#endif
#ifndef CBLAS
void NAME(char *TRANSA, char *TRANSB,
@ -224,8 +273,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
blasint m, blasint n, blasint k,
#ifndef COMPLEX
FLOAT alpha,
FLOAT *a, blasint lda,
FLOAT *b, blasint ldb,
IFLOAT *a, blasint lda,
IFLOAT *b, blasint ldb,
FLOAT beta,
FLOAT *c, blasint ldc) {
#else
@ -277,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
PRINT_DEBUG_CNAME;
#if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT)
#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT)
#ifdef DYNAMIC_ARCH
if (support_avx512() )
#endif
@ -417,6 +466,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
FUNCTION_PROFILE_START();
#if USE_SMALL_MATRIX_OPT
#if !defined(COMPLEX)
if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, *(FLOAT *)(args.alpha), *(FLOAT *)(args.beta))){
if(*(FLOAT *)(args.beta) == 0.0){
(GEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc);
}else{
(GEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc);
}
return;
}
#else
if(GEMM_SMALL_MATRIX_PERMIT(transa, transb, args.m, args.n, args.k, alpha[0], alpha[1], beta[0], beta[1])){
if(beta[0] == 0.0 && beta[1] == 0.0){
(ZGEMM_SMALL_KERNEL_B0((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc);
}else{
(ZGEMM_SMALL_KERNEL((transb << 2) | transa))(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc);
}
return;
}
#endif
#endif
buffer = (XFLOAT *)blas_memory_alloc(0);
sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);

View File

@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) {
FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info;
FLOAT * ALPHA = &alpha;
FLOAT alpha_r = ALPHA[0];
@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
PRINT_DEBUG_CNAME;
trans = -1;
uplo = -1;
info = 0;

View File

@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE")
# sbdot
if (BUILD_BFLOAT16)
GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16")
endif()
if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE")
@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SB")
endif ()
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type})
@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE")
endif ()
if (BUILD_BFLOAT16)
GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16")
endif ()
# Makefile.L3
set(USE_TRMM false)
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
endif()
foreach (float_type SINGLE DOUBLE BFLOAT16)
foreach (float_type SINGLE DOUBLE)
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "BFLOAT16")
if (NOT ${BUILD_BFLOAT16})
continue ()
else ()
set (float_char "SB")
endif ()
endif ()
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
endforeach()
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE")
endif ()
if (BUILD_BFLOAT16)
  # Copy kernels are optional: each one is registered only when its
  # SBGEMM<kind> variable is set, under the object name stored in the
  # matching SBGEMM<kind>OBJ variable.
  foreach (sb_copy_kind INCOPY ITCOPY ONCOPY OTCOPY)
    if (SBGEMM${sb_copy_kind})
      GenerateNamedObjects("${KERNELDIR}/${SBGEMM${sb_copy_kind}}" "" "${SBGEMM${sb_copy_kind}OBJ}" false "" "" true "BFLOAT16")
    endif ()
  endforeach ()

  # The compute kernel and the beta kernel are always registered.
  GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
  GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
endif ()
foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SB")
endif ()
if (${float_char}GEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
endif ()
@ -458,7 +474,155 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}TRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false ${float_type})
# Default sources for the small-matrix GEMM kernels of the current float type.
# Any <T>GEMM_SMALL_* variable that is not yet defined is pointed at the
# generic C implementation: the zgemm_* files for the complex types (C, Z),
# the gemm_* files otherwise. The B0 slots default to the same files as the
# regular kernel slots.
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
  set(small_matrix_stem "../generic/zgemm_small_matrix")
else ()
  set(small_matrix_stem "../generic/gemm_small_matrix")
endif ()

if (NOT DEFINED ${float_char}GEMM_SMALL_M_PERMIT)
  set(${float_char}GEMM_SMALL_M_PERMIT ${small_matrix_stem}_permit.c)
endif ()

foreach (small_slot_prefix K_ K_B0_)
  foreach (small_trans NN NT TN TT)
    # The generic file names carry the combination in lower case.
    string(TOLOWER ${small_trans} small_trans_lc)
    if (NOT DEFINED ${float_char}GEMM_SMALL_${small_slot_prefix}${small_trans})
      set(${float_char}GEMM_SMALL_${small_slot_prefix}${small_trans} ${small_matrix_stem}_kernel_${small_trans_lc}.c)
    endif ()
  endforeach ()
endforeach ()
if (SMALL_MATRIX_OPT)
  # Small-matrix GEMM objects for the current float type: one "permit" object
  # plus one kernel object per transpose/conjugate combination, each kernel
  # also built a second time with the B0 definition.
  GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false ${float_type})
  if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
    # Complex types: each of the four sources (NN, NT, TN, TT) is compiled
    # four times, the definition selecting which variant is generated.
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false ${float_type})
  else ()
    # Real types: one object per source. The tt objects are built from the
    # *_K_TT / *_K_B0_TT sources, matching the rules in Makefile.L3 (they were
    # previously built from the NT sources by mistake).
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type})
    GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type})
  endif ()
  if (BUILD_BFLOAT16)
    # BFLOAT16 small-matrix kernels: default every unset SBGEMM_SMALL_* source
    # to the generic real implementation, then register the objects.
    # NOTE(review): this block does not depend on float_char but appears to sit
    # inside the per-float-type section - confirm it is not registered once per
    # float type.
    if (NOT DEFINED SBGEMM_SMALL_M_PERMIT)
      set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_NN)
      set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_NT)
      set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_TN)
      set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_TT)
      set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_B0_NN)
      set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_B0_NT)
      set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_B0_TN)
      set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
    endif ()
    if (NOT DEFINED SBGEMM_SMALL_K_B0_TT)
      # The variable name must not carry a leading "$"; with it the default
      # was stored under a different name and SBGEMM_SMALL_K_B0_TT stayed unset.
      set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
    endif ()
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
  endif ()
endif ()
if (NOT DEFINED ${float_char}OMATCOPY_CN)
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C")
@ -592,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
#geadd
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
endforeach ()
if (BUILD_DOUBLE AND NOT BUILD_SINGLE)
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE")

View File

@ -1,3 +1,10 @@
# FMAFLAG: extra compiler flag for kernels that are built with FMA enabled.
# It stays empty unless OLDGCC is unset and HAVE_FMA3 is set, in which case it
# becomes -mfma. The dgemv_t rule further down adds it to its compile command.
FMAFLAG=
ifndef OLDGCC
ifdef HAVE_FMA3
FMAFLAG = -mfma
endif
endif
### GEMV ###
ifndef SGEMVNKERNEL
@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@
$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
$(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
endif
$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)

View File

@ -447,6 +447,72 @@ XBLASOBJS += \
endif
###### BLAS small matrix optimization #####
ifeq ($(SMALL_MATRIX_OPT), 1)

# Kernel name suffixes of the small-matrix GEMM objects: four combinations for
# the real types, sixteen for the complex types.
SMALL_GEMM_REAL_OPS = nn nt tn tt
SMALL_GEMM_CPLX_OPS = nn nt nr nc tn tt tr tc rn rt rr rc cn ct cr cc

# $(call small_gemm_objs,<type prefix>,<suffix list>) expands to the permit
# object, then the regular kernel objects, then the b0 kernel objects.
small_gemm_objs = \
  $(1)gemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
  $(foreach op,$(2),$(1)gemm_small_kernel_$(op)$(TSUFFIX).$(SUFFIX)) \
  $(foreach op,$(2),$(1)gemm_small_kernel_b0_$(op)$(TSUFFIX).$(SUFFIX))

# The bfloat16 objects are only added when BUILD_BFLOAT16 is 1.
ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += $(call small_gemm_objs,sb,$(SMALL_GEMM_REAL_OPS))
endif

SBLASOBJS += $(call small_gemm_objs,s,$(SMALL_GEMM_REAL_OPS))
DBLASOBJS += $(call small_gemm_objs,d,$(SMALL_GEMM_REAL_OPS))
CBLASOBJS += $(call small_gemm_objs,c,$(SMALL_GEMM_CPLX_OPS))
ZBLASOBJS += $(call small_gemm_objs,z,$(SMALL_GEMM_CPLX_OPS))

endif
###### BLAS extensions #####
ifeq ($(BUILD_SINGLE),1)
@ -4237,3 +4303,469 @@ endif
$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
###### BLAS small matrix optimization #####
# Double-precision real small-matrix GEMM kernels.
# Every DGEMM_SMALL_* source variable that is still unset here falls back to
# the corresponding file under ../generic.
ifndef DGEMM_SMALL_M_PERMIT
DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif
ifndef DGEMM_SMALL_K_NN
DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef DGEMM_SMALL_K_NT
DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef DGEMM_SMALL_K_TN
DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef DGEMM_SMALL_K_TT
DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
# Permit object and the four regular kernels, all compiled with
# -DDOUBLE -UCOMPLEX.
$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
# The b0 kernels default to the same generic sources as the regular kernels
# and are compiled with -DB0 added.
ifndef DGEMM_SMALL_K_B0_NN
DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef DGEMM_SMALL_K_B0_NT
DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef DGEMM_SMALL_K_B0_TN
DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef DGEMM_SMALL_K_B0_TT
DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
# Single-precision real small-matrix GEMM kernels.
# Every SGEMM_SMALL_* source variable that is still unset here falls back to
# the corresponding file under ../generic.
ifndef SGEMM_SMALL_M_PERMIT
SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif
ifndef SGEMM_SMALL_K_NN
SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SGEMM_SMALL_K_NT
SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SGEMM_SMALL_K_TN
SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SGEMM_SMALL_K_TT
SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
# Permit object and the four regular kernels, all compiled with
# -UDOUBLE -UCOMPLEX.
$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
# The b0 kernels default to the same generic sources as the regular kernels
# and are compiled with -DB0 added.
ifndef SGEMM_SMALL_K_B0_NN
SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SGEMM_SMALL_K_B0_NT
SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SGEMM_SMALL_K_B0_TN
SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SGEMM_SMALL_K_B0_TT
SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
# bfloat16 small-matrix GEMM kernels; defined only when BUILD_BFLOAT16 is 1.
# Every SBGEMM_SMALL_* source variable that is still unset here falls back to
# the same generic sources the single/double real kernels use.
ifeq ($(BUILD_BFLOAT16), 1)
ifndef SBGEMM_SMALL_M_PERMIT
SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
endif
ifndef SBGEMM_SMALL_K_NN
SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SBGEMM_SMALL_K_NT
SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SBGEMM_SMALL_K_TN
SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SBGEMM_SMALL_K_TT
SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
# Permit object and the four regular kernels, all compiled with
# -DBFLOAT16 -UDOUBLE -UCOMPLEX.
$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
# The b0 kernels default to the same generic sources as the regular kernels
# and are compiled with -DB0 added.
ifndef SBGEMM_SMALL_K_B0_NN
SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SBGEMM_SMALL_K_B0_NT
SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SBGEMM_SMALL_K_B0_TN
SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SBGEMM_SMALL_K_B0_TT
SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
endif
# Single-precision complex small-matrix GEMM kernels.
# Every CGEMM_SMALL_* source variable that is still unset here falls back to
# the generic zgemm_small_matrix_* file under ../generic.
ifndef CGEMM_SMALL_M_PERMIT
CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
endif
ifndef CGEMM_SMALL_K_NN
CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef CGEMM_SMALL_K_NT
CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef CGEMM_SMALL_K_TN
CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef CGEMM_SMALL_K_TT
CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif
# All objects are compiled with -UDOUBLE -DCOMPLEX. Each of the four kernel
# sources (NN, NT, TN, TT) is compiled four times; the extra -D<variant> macro
# selects which of the sixteen kernels the object provides.
# NOTE(review): RC and CR are passed as -DRC=RC / -DCR=CR rather than as plain
# -DRC / -DCR - presumably so the macro does not expand to 1 where the sources
# use that identifier; confirm.
$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@
$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@
$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@
$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@
$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
# The b0 kernels default to the same generic sources as the regular kernels
# and are compiled with -DB0 added to the variant macro.
ifndef CGEMM_SMALL_K_B0_NN
CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef CGEMM_SMALL_K_B0_NT
CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef CGEMM_SMALL_K_B0_TN
CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef CGEMM_SMALL_K_B0_TT
CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif
$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@
# Double-precision complex small-matrix GEMM kernels.
# Every ZGEMM_SMALL_* source variable that is still unset here falls back to
# the generic zgemm_small_matrix_* file under ../generic.
ifndef ZGEMM_SMALL_M_PERMIT
ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
endif
ifndef ZGEMM_SMALL_K_NN
ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef ZGEMM_SMALL_K_NT
ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef ZGEMM_SMALL_K_TN
ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef ZGEMM_SMALL_K_TT
ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif
# All objects are compiled with -DDOUBLE -DCOMPLEX. Each of the four kernel
# sources (NN, NT, TN, TT) is compiled four times; the extra -D<variant> macro
# selects which of the sixteen kernels the object provides.
# NOTE(review): RC and CR are passed as -DRC=RC / -DCR=CR rather than as plain
# -DRC / -DCR - presumably so the macro does not expand to 1 where the sources
# use that identifier; confirm.
$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@
$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@
$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@
$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@
$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
# The b0 kernels default to the same generic sources as the regular kernels
# and are compiled with -DB0 added to the variant macro.
ifndef ZGEMM_SMALL_K_B0_NN
ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef ZGEMM_SMALL_K_B0_NT
ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef ZGEMM_SMALL_K_B0_TN
ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef ZGEMM_SMALL_K_B0_TT
ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif
$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@

View File

@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define B03 x16
#define B04 x17
#define I x18
#define J x19
#define I x19
#define J x20
#define TEMP1 x20
#define TEMP2 x21
#define TEMP1 x21
#define A_PREFETCH 2560
#define B_PREFETCH 256

View File

@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alpha x17
#define temp x18
//#define temp x18
#define tempOffset x19
#define tempK x20
#define temp x21
#define alpha0 d10
#define alphaV0 v10.d[0]

View File

@ -30,7 +30,7 @@ All rights reserved.
#define B00 x22
#define I x18
#define I x21
#define J x19
#define TEMP1 x20

View File

@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alpha w17
#define temp x18
//#define temp x18
#define tempOffset x19
#define tempK x20
#define temp x21
#define alpha0 s10
#define alphaV0 v10.s[0]

View File

@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow2 x14
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alphaR x19
#define alphaI x20
#define alpha0_R d10
#define alphaV0_R v10.d[0]

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alphaI x22
#define temp x19
#define tempOffset x20
#define tempK x21

View File

@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1) )
{
int n1 = n & -4;
#if V_SIMD && !defined(DSDOT)
const int vstep = v_nlanes_f32;
const int unrollx4 = n & (-vstep * 4);
@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
}
dot = v_sum_f32(vsum0);
#elif defined(DSDOT)
int n1 = n & -4;
for (; i < n1; i += 4)
{
dot += (double) y[i] * (double) x[i]
@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+ (double) y[i+3] * (double) x[i+3] ;
}
#else
int n1 = n & -4;
for (; i < n1; i += 4)
{
dot += y[i] * x[i]

View File

@ -0,0 +1,56 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix GEMM: A and B both non-transposed.
 *
 * Addressing is column-major: A(i,k) is A[i + k*lda], B(k,j) is
 * B[k + j*ldb] and C(i,j) is C[i + j*ldc].
 *
 *   without B0:  C(i,j) = C(i,j) * beta + alpha * sum_k A(i,k) * B(k,j)
 *   with    B0:  C(i,j) = alpha * sum_k A(i,k) * B(k,j)
 *                (no beta parameter; C is written without being read)
 *
 * The result covers i in [0, M), j in [0, N); each sum runs over k in [0, K).
 * Always returns 0.
 */
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
	BLASLONG row, col, p;
	BLASLONG b_base, c_pos;
	FLOAT acc;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			b_base = col * ldb;       // offset of column `col` in B
			c_pos = row + col * ldc;  // offset of C(row, col)

			// dot product of row `row` of A with column `col` of B
			acc = 0.0;
			for (p = 0; p < K; p++) {
				acc += A[row + p * lda] * B[p + b_base];
			}
#ifdef B0
			C[c_pos] = alpha * acc;
#else
			C[c_pos] = C[c_pos] * beta + alpha * acc;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,56 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix GEMM: A non-transposed, B transposed.
 *
 * A(i,k) is read from A[i + k*lda]; the B operand for (k,j) is read from
 * B[k*ldb + j], i.e. B is walked across a row of its stored layout.
 * C(i,j) is C[i + j*ldc].
 *
 *   without B0:  C(i,j) = C(i,j) * beta + alpha * sum_k A[i + k*lda] * B[k*ldb + j]
 *   with    B0:  C(i,j) = alpha * sum_k A[i + k*lda] * B[k*ldb + j]
 *                (no beta parameter; C is written without being read)
 *
 * The result covers i in [0, M), j in [0, N); each sum runs over k in [0, K).
 * Always returns 0.
 */
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
	BLASLONG row, col, p;
	BLASLONG c_pos;
	FLOAT acc;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			c_pos = row + col * ldc;  // offset of C(row, col)

			// both operands advance by their leading dimension per step
			acc = 0.0;
			for (p = 0; p < K; p++) {
				acc += A[row + p * lda] * B[p * ldb + col];
			}
#ifdef B0
			C[c_pos] = alpha * acc;
#else
			C[c_pos] = C[c_pos] * beta + alpha * acc;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,57 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix GEMM: A transposed, B non-transposed.
 *
 * The A operand for (i,k) is read from A[i*lda + k], B(k,j) from
 * B[k + j*ldb], so both operands are walked contiguously along k.
 * C(i,j) is C[i + j*ldc].
 *
 *   without B0:  C(i,j) = C(i,j) * beta + alpha * sum_k A[i*lda + k] * B[k + j*ldb]
 *   with    B0:  C(i,j) = alpha * sum_k A[i*lda + k] * B[k + j*ldb]
 *                (no beta parameter; C is written without being read)
 *
 * The result covers i in [0, M), j in [0, N); each sum runs over k in [0, K).
 * Always returns 0.
 */
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
	BLASLONG row, col, p;
	BLASLONG a_base, b_base, c_pos;
	FLOAT acc;

	for (row = 0; row < M; row++) {
		a_base = row * lda;           // offset of the A slice used for this row of C
		for (col = 0; col < N; col++) {
			b_base = col * ldb;       // offset of column `col` in B
			c_pos = row + col * ldc;  // offset of C(row, col)

			acc = 0.0;
			for (p = 0; p < K; p++) {
				acc += A[a_base + p] * B[p + b_base];
			}
#ifdef B0
			C[c_pos] = alpha * acc;
#else
			C[c_pos] = C[c_pos] * beta + alpha * acc;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,57 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix GEMM: A and B both transposed.
 *
 * The A operand for (i,k) is read from A[i*lda + k] and the B operand for
 * (k,j) from B[k*ldb + j]. C(i,j) is C[i + j*ldc].
 *
 *   without B0:  C(i,j) = C(i,j) * beta + alpha * sum_k A[i*lda + k] * B[k*ldb + j]
 *   with    B0:  C(i,j) = alpha * sum_k A[i*lda + k] * B[k*ldb + j]
 *                (no beta parameter; C is written without being read)
 *
 * The result covers i in [0, M), j in [0, N); each sum runs over k in [0, K).
 * Always returns 0.
 */
#ifdef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
#endif
{
	BLASLONG row, col, p;
	BLASLONG a_base, c_pos;
	FLOAT acc;

	for (row = 0; row < M; row++) {
		a_base = row * lda;           // offset of the A slice used for this row of C
		for (col = 0; col < N; col++) {
			c_pos = row + col * ldc;  // offset of C(row, col)

			acc = 0.0;
			for (p = 0; p < K; p++) {
				acc += A[a_base + p] * B[p * ldb + col];
			}
#ifdef B0
			C[c_pos] = alpha * acc;
#else
			C[c_pos] = C[c_pos] * beta + alpha * acc;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,40 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Size predicate taking the transposition flags, the problem dimensions and
 * the real alpha/beta scalars. It currently ignores all of them and returns 0
 * unconditionally.
 *
 * NOTE(review): presumably this is the gate consulted before dispatching to
 * the small-matrix GEMM kernels, with non-zero meaning "allowed" — confirm
 * against the caller.
 */
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta)
{
	/*
	 * Disabled size heuristic, kept for reference (1 when M*N*K <= 100^3):
	 *
	 *   double MNK = (double) M * (double) N * (double) K;
	 *   if (MNK <= 100.0*100.0*100.0)
	 *     return 1;
	 *   else
	 *     return 0;
	 */
	return 0;
}

View File

@ -0,0 +1,89 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix complex GEMM, A and B both stored
 * non-transposed. Complex values are interleaved (real, imag) pairs, so
 * A(i,l) starts at A[l*2*lda + 2*i], B(l,j) at B[j*2*ldb + 2*l] and
 * C(i,j) at C[j*2*ldc + 2*i].
 *
 * Exactly one of these macros selects the conjugation applied to each product:
 *   NN: A * B            NR: A * conj(B)
 *   RN: conj(A) * B      RR: conj(A) * conj(B)
 * If none is defined the accumulated sum stays zero.
 *
 *   without B0:  C = (beta0 + i*beta1) * C + (alpha0 + i*alpha1) * sum
 *   with    B0:  C = (alpha0 + i*alpha1) * sum   (no beta; C is not read)
 *
 * Always returns 0.
 */
#ifndef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#endif
{
	FLOAT real, imag;
	FLOAT a_re, a_im, b_re, b_im;
#ifndef B0
	FLOAT tmp0, tmp1;
#endif
	/* Indices use BLASLONG, like M/N/K and the leading dimensions, so the
	 * loop bounds and the offset arithmetic are never evaluated in plain int. */
	BLASLONG i, j, l;
	BLASLONG a_idx, b_idx, c_idx;

	for (i = 0; i < M; i++) {
		for (j = 0; j < N; j++) {
			real = 0;
			imag = 0;

			for (l = 0; l < K; l++) {
				a_idx = l*2*lda + 2*i;  /* A(i,l) */
				b_idx = j*2*ldb + 2*l;  /* B(l,j) */
				a_re = A[a_idx];
				a_im = A[a_idx + 1];
				b_re = B[b_idx];
				b_im = B[b_idx + 1];
#if defined(NN)
				real += (a_re * b_re - a_im * b_im);
				imag += (a_re * b_im + a_im * b_re);
#elif defined(NR)
				real += (a_re * b_re + a_im * b_im);
				imag += (-a_re * b_im + a_im * b_re);
#elif defined(RN)
				real += (a_re * b_re + a_im * b_im);
				imag += (a_re * b_im - a_im * b_re);
#elif defined(RR)
				real += (a_re * b_re - a_im * b_im);
				imag += (-a_re * b_im - a_im * b_re);
#endif
			}

			c_idx = j*2*ldc + 2*i;  /* C(i,j) */
#ifndef B0
			/* beta * C is fully formed before either half of C is overwritten */
			tmp0 = beta0*C[c_idx] - beta1*C[c_idx + 1];
			tmp1 = beta0*C[c_idx + 1] + beta1*C[c_idx];

			C[c_idx] = tmp0 + alpha0*real - alpha1*imag;
			C[c_idx + 1] = tmp1 + alpha0*imag + real*alpha1;
#else
			C[c_idx] = alpha0*real - alpha1*imag;
			C[c_idx + 1] = alpha0*imag + real*alpha1;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,93 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix complex GEMM, A stored non-transposed
 * and B stored transposed. Complex values are interleaved (real, imag) pairs:
 * A(i,l) starts at A[l*2*lda + 2*i], the B operand for (l,j) at
 * B[l*2*ldb + 2*j] and C(i,j) at C[j*2*ldc + 2*i].
 *
 * Exactly one of these macros selects the conjugation applied to each product:
 *   NT: A * B            NC: A * conj(B)
 *   RT: conj(A) * B      RC: conj(A) * conj(B)
 * (A and B here denote the operands as addressed above.)
 * If none is defined the accumulated sum stays zero.
 *
 *   without B0:  C = (beta0 + i*beta1) * C + (alpha0 + i*alpha1) * sum
 *   with    B0:  C = (alpha0 + i*alpha1) * sum   (no beta; C is not read)
 *
 * Always returns 0.
 */
#ifndef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#endif
{
	FLOAT real, imag;
	FLOAT a_re, a_im, b_re, b_im;
#ifndef B0
	FLOAT tmp0, tmp1;
#endif
	/* Indices use BLASLONG, like M/N/K and the leading dimensions, so the
	 * loop bounds and the offset arithmetic are never evaluated in plain int. */
	BLASLONG i, j, l;
	BLASLONG a_idx, b_idx, c_idx;

	for (i = 0; i < M; i++) {
		for (j = 0; j < N; j++) {
			real = 0;
			imag = 0;

			for (l = 0; l < K; l++) {
				a_idx = l*2*lda + 2*i;  /* A operand for (i,l) */
				b_idx = l*2*ldb + 2*j;  /* B operand for (l,j) */
				a_re = A[a_idx];
				a_im = A[a_idx + 1];
				b_re = B[b_idx];
				b_im = B[b_idx + 1];
#if defined(NT)
				real += (a_re * b_re - a_im * b_im);
				imag += (a_re * b_im + a_im * b_re);
#elif defined(NC)
				real += (a_re * b_re + a_im * b_im);
				imag += (-a_re * b_im + a_im * b_re);
#elif defined(RT)
				real += (a_re * b_re + a_im * b_im);
				imag += (a_re * b_im - a_im * b_re);
#elif defined(RC)
				real += (a_re * b_re - a_im * b_im);
				imag += (-a_re * b_im - a_im * b_re);
#endif
			}

			c_idx = j*2*ldc + 2*i;  /* C(i,j) */
#ifndef B0
			/* beta * C is fully formed before either half of C is overwritten */
			tmp0 = beta0*C[c_idx] - beta1*C[c_idx + 1];
			tmp1 = beta0*C[c_idx + 1] + beta1*C[c_idx];

			C[c_idx] = tmp0 + alpha0*real - alpha1*imag;
			C[c_idx + 1] = tmp1 + alpha0*imag + real*alpha1;
#else
			C[c_idx] = alpha0*real - alpha1*imag;
			C[c_idx + 1] = alpha0*imag + real*alpha1;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,93 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix complex GEMM, A stored transposed and
 * B stored non-transposed. Complex values are interleaved (real, imag) pairs:
 * the A operand for (i,l) starts at A[i*2*lda + 2*l], B(l,j) at
 * B[j*2*ldb + 2*l] and C(i,j) at C[j*2*ldc + 2*i].
 *
 * Exactly one of these macros selects the conjugation applied to each product:
 *   TN: A * B            TR: A * conj(B)
 *   CN: conj(A) * B      CR: conj(A) * conj(B)
 * (A and B here denote the operands as addressed above.)
 * If none is defined the accumulated sum stays zero.
 *
 *   without B0:  C = (beta0 + i*beta1) * C + (alpha0 + i*alpha1) * sum
 *   with    B0:  C = (alpha0 + i*alpha1) * sum   (no beta; C is not read)
 *
 * Always returns 0.
 */
#ifndef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#endif
{
	FLOAT real, imag;
	FLOAT a_re, a_im, b_re, b_im;
#ifndef B0
	FLOAT tmp0, tmp1;
#endif
	/* Indices use BLASLONG, like M/N/K and the leading dimensions, so the
	 * loop bounds and the offset arithmetic are never evaluated in plain int. */
	BLASLONG i, j, l;
	BLASLONG a_idx, b_idx, c_idx;

	for (i = 0; i < M; i++) {
		for (j = 0; j < N; j++) {
			real = 0;
			imag = 0;

			for (l = 0; l < K; l++) {
				a_idx = i*2*lda + 2*l;  /* A operand for (i,l) */
				b_idx = j*2*ldb + 2*l;  /* B operand for (l,j) */
				a_re = A[a_idx];
				a_im = A[a_idx + 1];
				b_re = B[b_idx];
				b_im = B[b_idx + 1];
#if defined(TN)
				real += (a_re * b_re - a_im * b_im);
				imag += (a_re * b_im + a_im * b_re);
#elif defined(TR)
				real += (a_re * b_re + a_im * b_im);
				imag += (-a_re * b_im + a_im * b_re);
#elif defined(CN)
				real += (a_re * b_re + a_im * b_im);
				imag += (a_re * b_im - a_im * b_re);
#elif defined(CR)
				real += (a_re * b_re - a_im * b_im);
				imag += (-a_re * b_im - a_im * b_re);
#endif
			}

			c_idx = j*2*ldc + 2*i;  /* C(i,j) */
#ifndef B0
			/* beta * C is fully formed before either half of C is overwritten */
			tmp0 = beta0*C[c_idx] - beta1*C[c_idx + 1];
			tmp1 = beta0*C[c_idx + 1] + beta1*C[c_idx];

			C[c_idx] = tmp0 + alpha0*real - alpha1*imag;
			C[c_idx + 1] = tmp1 + alpha0*imag + real*alpha1;
#else
			C[c_idx] = alpha0*real - alpha1*imag;
			C[c_idx + 1] = alpha0*imag + real*alpha1;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,93 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference (unoptimized) small-matrix complex GEMM, A and B both stored
 * transposed. Complex values are interleaved (real, imag) pairs: the A
 * operand for (i,l) starts at A[i*2*lda + 2*l], the B operand for (l,j) at
 * B[l*2*ldb + 2*j] and C(i,j) at C[j*2*ldc + 2*i].
 *
 * Exactly one of these macros selects the conjugation applied to each product:
 *   TT: A * B            TC: A * conj(B)
 *   CT: conj(A) * B      CC: conj(A) * conj(B)
 * (A and B here denote the operands as addressed above.)
 * If none is defined the accumulated sum stays zero.
 *
 *   without B0:  C = (beta0 + i*beta1) * C + (alpha0 + i*alpha1) * sum
 *   with    B0:  C = (alpha0 + i*alpha1) * sum   (no beta; C is not read)
 *
 * Always returns 0.
 */
#ifndef B0
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
#else
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
#endif
{
	FLOAT real, imag;
	FLOAT a_re, a_im, b_re, b_im;
#ifndef B0
	FLOAT tmp0, tmp1;
#endif
	/* Indices use BLASLONG, like M/N/K and the leading dimensions, so the
	 * loop bounds and the offset arithmetic are never evaluated in plain int. */
	BLASLONG i, j, l;
	BLASLONG a_idx, b_idx, c_idx;

	for (i = 0; i < M; i++) {
		for (j = 0; j < N; j++) {
			real = 0;
			imag = 0;

			for (l = 0; l < K; l++) {
				a_idx = i*2*lda + 2*l;  /* A operand for (i,l) */
				b_idx = l*2*ldb + 2*j;  /* B operand for (l,j) */
				a_re = A[a_idx];
				a_im = A[a_idx + 1];
				b_re = B[b_idx];
				b_im = B[b_idx + 1];
#if defined(TT)
				real += (a_re * b_re - a_im * b_im);
				imag += (a_re * b_im + a_im * b_re);
#elif defined(TC)
				real += (a_re * b_re + a_im * b_im);
				imag += (-a_re * b_im + a_im * b_re);
#elif defined(CT)
				real += (a_re * b_re + a_im * b_im);
				imag += (a_re * b_im - a_im * b_re);
#elif defined(CC)
				real += (a_re * b_re - a_im * b_im);
				imag += (-a_re * b_im - a_im * b_re);
#endif
			}

			c_idx = j*2*ldc + 2*i;  /* C(i,j) */
#ifndef B0
			/* beta * C is fully formed before either half of C is overwritten */
			tmp0 = beta0*C[c_idx] - beta1*C[c_idx + 1];
			tmp1 = beta0*C[c_idx + 1] + beta1*C[c_idx];

			C[c_idx] = tmp0 + alpha0*real - alpha1*imag;
			C[c_idx + 1] = tmp1 + alpha0*imag + real*alpha1;
#else
			C[c_idx] = alpha0*real - alpha1*imag;
			C[c_idx + 1] = alpha0*imag + real*alpha1;
#endif
		}
	}

	return 0;
}

View File

@ -0,0 +1,40 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Size predicate taking the transposition flags, the problem dimensions and
 * the complex alpha/beta scalars (each split into two FLOAT components). It
 * currently ignores all of them and returns 0 unconditionally.
 *
 * NOTE(review): presumably this is the gate consulted before dispatching to
 * the complex small-matrix GEMM kernels, with non-zero meaning "allowed" —
 * confirm against the caller.
 */
int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha0, FLOAT alpha1, FLOAT beta0, FLOAT beta1)
{
	/*
	 * Disabled size heuristic, kept for reference (1 when M*N*K <= 100^3):
	 *
	 *   double MNK = (double) M * (double) N * (double) K;
	 *   if (MNK <= 100.0*100.0*100.0)
	 *     return 1;
	 *   else
	 *     return 0;
	 */
	return 0;
}

238
kernel/loongarch64/KERNEL Normal file
View File

@ -0,0 +1,238 @@
# Default kernel sources for loongarch64.
# Every assignment is wrapped in `ifndef`, so a value that is already defined
# when this file is read takes precedence over the default given here.

# AXPY: C sources from ../arm (real and complex variants share one file each).
ifndef SAXPYKERNEL
SAXPYKERNEL = ../arm/axpy.c
endif
ifndef DAXPYKERNEL
DAXPYKERNEL = ../arm/axpy.c
endif
ifndef CAXPYKERNEL
CAXPYKERNEL = ../arm/zaxpy.c
endif
ifndef ZAXPYKERNEL
ZAXPYKERNEL = ../arm/zaxpy.c
endif
# ROT: C sources from ../arm.
ifndef SROTKERNEL
SROTKERNEL = ../arm/rot.c
endif
ifndef DROTKERNEL
DROTKERNEL = ../arm/rot.c
endif
ifndef CROTKERNEL
CROTKERNEL = ../arm/zrot.c
endif
ifndef ZROTKERNEL
ZROTKERNEL = ../arm/zrot.c
endif
# SWAP: only the complex variants get a default in this file.
ifndef CSWAPKERNEL
CSWAPKERNEL = ../arm/zswap.c
endif
ifndef ZSWAPKERNEL
ZSWAPKERNEL = ../arm/zswap.c
endif
# SUM: C sources from ../arm.
ifndef SSUMKERNEL
SSUMKERNEL = ../arm/sum.c
endif
ifndef DSUMKERNEL
DSUMKERNEL = ../arm/sum.c
endif
ifndef CSUMKERNEL
CSUMKERNEL = ../arm/zsum.c
endif
ifndef ZSUMKERNEL
ZSUMKERNEL = ../arm/zsum.c
endif
# Index-of-max / index-of-min: only the S and D variants get a default here.
ifndef ISMAXKERNEL
ISMAXKERNEL = ../arm/imax.c
endif
ifndef IDMAXKERNEL
IDMAXKERNEL = ../arm/imax.c
endif
ifndef ISMINKERNEL
ISMINKERNEL = ../arm/imin.c
endif
ifndef IDMINKERNEL
IDMINKERNEL = ../arm/imin.c
endif
# NRM2: assembly sources named without a directory prefix, one per precision.
ifndef SNRM2KERNEL
SNRM2KERNEL = snrm2.S
endif
ifndef DNRM2KERNEL
DNRM2KERNEL = dnrm2.S
endif
ifndef CNRM2KERNEL
CNRM2KERNEL = cnrm2.S
endif
ifndef ZNRM2KERNEL
ZNRM2KERNEL = znrm2.S
endif
# CABS and LSAME helpers: C sources from ../generic.
ifndef SCABS_KERNEL
SCABS_KERNEL = ../generic/cabs.c
endif
ifndef DCABS_KERNEL
DCABS_KERNEL = ../generic/cabs.c
endif
ifndef QCABS_KERNEL
QCABS_KERNEL = ../generic/cabs.c
endif
ifndef LSAME_KERNEL
LSAME_KERNEL = ../generic/lsame.c
endif
# GEMM compute kernels and their packing (copy) routines.
# Each group is guarded by its *GEMMKERNEL variable only: if the kernel is
# already defined, none of the copy sources or object names in that group are
# set here. Inside a group, the I*COPY and O*COPY sources use different
# generic files.
# NOTE(review): the numeric suffix of the copy files (_2/_8, _1/_4) presumably
# has to match the unroll factors configured for this target — confirm.

# Single precision real: shares gemm_kernel.S with double precision.
ifndef SGEMMKERNEL
SGEMMKERNEL = gemm_kernel.S
SGEMMINCOPY = ../generic/gemm_ncopy_2.c
SGEMMITCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
endif
# Double precision real.
ifndef DGEMMKERNEL
DGEMMKERNEL = gemm_kernel.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
endif
# Single precision complex: shares zgemm_kernel.S with double complex.
ifndef CGEMMKERNEL
CGEMMKERNEL = zgemm_kernel.S
CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
endif
# Double precision complex.
ifndef ZGEMMKERNEL
ZGEMMKERNEL = zgemm_kernel.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ = zgemm_incopy.o
ZGEMMITCOPYOBJ = zgemm_itcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
endif
# GEMM beta routines: generic C sources, one file shared by the two real
# precisions and one shared by the two complex precisions. Each default only
# applies when the variable is not already defined.
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif
# TRSM kernels: assembly sources, each default applied only when the variable
# is not already defined.
# Real precisions: LN, LT and RT each have their own source, while RN reuses
# trsm_kernel_LT.S.
# NOTE(review): the shared source presumably selects the variant through a
# build-time macro — confirm before pointing RN at a different file.
ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = trsm_kernel_LN.S
endif
ifndef STRSMKERNEL_LT
STRSMKERNEL_LT = trsm_kernel_LT.S
endif
ifndef STRSMKERNEL_RN
STRSMKERNEL_RN = trsm_kernel_LT.S
endif
ifndef STRSMKERNEL_RT
STRSMKERNEL_RT = trsm_kernel_RT.S
endif
ifndef DTRSMKERNEL_LN
DTRSMKERNEL_LN = trsm_kernel_LN.S
endif
ifndef DTRSMKERNEL_LT
DTRSMKERNEL_LT = trsm_kernel_LT.S
endif
ifndef DTRSMKERNEL_RN
DTRSMKERNEL_RN = trsm_kernel_LT.S
endif
ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
endif
# Complex precisions: LN, LT and RN all use ztrsm_kernel_LT.S; only RT has a
# separate source (ztrsm_kernel_RT.S).
ifndef CTRSMKERNEL_LN
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_LT
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RN
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
# GEMM3M kernels: both complex precisions default to zgemm3m_kernel.S when
# not already defined.
ifndef CGEMM3MKERNEL
CGEMM3MKERNEL = zgemm3m_kernel.S
endif
ifndef ZGEMM3MKERNEL
ZGEMM3MKERNEL = zgemm3m_kernel.S
endif
# DSDOT is assigned unconditionally (no ifndef guard), so this line replaces
# any value the variable had before this point.
DSDOTKERNEL = dot.S

View File

@ -0,0 +1 @@
#TODO: Add loongarch64 SIMD optimizations

View File

@ -0,0 +1,167 @@
# Kernel selection that uses only portable C sources from ../generic for the
# GEMM, TRMM and TRSM families. All assignments are unconditional.

# Beta scaling: one source for real, one for complex.
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
# TRMM kernels (2x2 variants).
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# GEMM kernels (2x2 variants) with the matching *_2 copy routines. Only the
# "O" copy routines are assigned here; no INCOPY/ITCOPY variables are set.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM kernels: one C source per variant (LN, LT, RN, RT).
# NOTE(review): the complex precisions (C/Z) use the same trsm_kernel_*.c
# files as the real ones rather than z-prefixed sources; presumably the
# sources are compiled differently per precision - confirm.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# Pure C sources for the remaining kernels. Most come from ../arm; the
# SYMV/HEMV, LSAME and CABS entries (and SDOT) come from ../generic.

# Absolute max / absolute min.
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
# Signed max / min (real precisions only).
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
# Index-returning variants of the four reductions above.
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
# Sum of absolute values and plain sum.
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
# AXPY and COPY.
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
# Dot products.
# NOTE(review): SDOT is the only entry in this group taken from ../generic
# instead of ../arm; presumably deliberate - confirm before unifying.
SDOTKERNEL = ../generic/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
# NRM2, ROT, SCAL, SWAP.
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
# GEMV, separate sources for the N and T variants.
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
# SYMV: the U and L variants share one source per real/complex family.
# Q* and X* entries are set alongside S/D and C/Z.
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
# HEMV: only the Z variables are assigned in this file.
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
# Helper kernels.
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
# GEMM3M kernels: both complex precisions point at the generic
# zgemm3mkernel_dump.c source.
# NOTE(review): "dump" presumably marks a placeholder implementation -
# confirm against ../generic/zgemm3mkernel_dump.c.
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

View File

@ -0,0 +1 @@
# Double-colon "clean" rule with no prerequisites and no recipe: it declares
# the target here without adding any cleanup commands of its own.
clean ::

230
kernel/loongarch64/amax.S Normal file
View File

@ -0,0 +1,230 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * amax kernel: reduction of a strided vector to a single value built from
 * the absolute values of its elements (the running value is replaced
 * whenever it compares less than a new |x[i]|).
 *
 * Inputs:  N ($r4) element count, X ($r5) pointer to the first element,
 *          INCX ($r6) stride in elements (scaled to bytes below).
 * Result:  $f0, copied from s1 ($f22) at .L999.  s1 is first loaded from
 *          $r0 (the zero register), which is what the N <= 0 and INCX <= 0
 *          early exits return.
 *
 * Layout:  four running candidates s1..s4 are seeded with |x[0]|; the
 *          remaining N-1 elements are handled eight per iteration in a loop
 *          that overlaps the loads of the next group with the compares of
 *          the current one; a scalar tail covers the leftover (N-1) mod 8
 *          elements; .L998 folds the four candidates into s1.
 *
 * LD, FABS, CMPLT, CMOVT, MTC, LDINT, SIZE and BASE_SHIFT come from
 * common.h and are not visible here.
 * NOTE(review): the comments assume CMPLT sets the condition flag when its
 * first source is less than its second, and that CMOVT writes its second
 * source when the flag is set and its first otherwise - confirm in common.h.
 */
/* Integer arguments. */
#define N $r4
#define X $r5
#define INCX $r6
/* Loop counter.  TEMP is defined but never referenced in this routine. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements kept in flight by the unrolled loop. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t4: absolute values currently being compared. */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: four independent running results; s1 is the one returned. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
/* With F_INTERFACE the two integer arguments are loaded through the
   addresses passed in N and INCX. */
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Default result for the early exits. */
MTC s1, $r0
/* N <= 0: nothing to scan. */
bge $r0, N, .L999
/* Scale the stride (BASE_SHIFT is presumably log2 of the element size). */
slli.d INCX, INCX, BASE_SHIFT
/* INCX <= 0 also takes the early exit. */
bge $r0, INCX, .L999
/* Seed the candidates with |x[0]| and count that element off. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
/* Only one element: s1 already holds the answer. */
bge $r0, N, .L999
FABS s3, a1
/* I = number of complete groups of eight among the remaining elements. */
srai.d I, N, 3
FABS s4, a1
/* Fewer than eight left: go straight to the scalar tail. */
bge $r0, I, .L15
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L13
.align 3
/* Steady state: compare the group already held in a1..a8 while loading the
   next group into the same registers.  Every aN is read by FABS before the
   LD that overwrites it, so the instruction order must be kept. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
/* Second half of the group (a5..a8). */
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3
/* Drain: compare the last preloaded group; no further loads are issued. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3
/* Tail: the low three bits of the (already decremented) N give the number
   of leftover elements; they are folded into s1 only, one per iteration. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Fold the four candidates: (s1,s2) -> s1, (s3,s4) -> s3, (s1,s3) -> s1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
.align 3
/* Return s1 in $f0 and jump to the address in $r1.
   NOTE(review): $r4 is also overwritten with I ($r17), which has not been
   written on the early-exit paths; presumably the integer return register
   is ignored by callers - confirm. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

186
kernel/loongarch64/amin.S Normal file
View File

@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * amin kernel: same structure as a strided absolute-value reduction with
 * four running candidates, but the running value is replaced whenever a new
 * |x[i]| compares less than it (the CMPLT operands are "new, running").
 *
 * Inputs:  N ($r4) element count, X ($r5) pointer to the first element,
 *          INCX ($r6) stride in elements (scaled to bytes below).
 * Result:  $f0, copied from s1 ($f22) at .L999.  s1 is first loaded from
 *          $r0 (the zero register), which is what the N <= 0 and INCX <= 0
 *          early exits return.
 *
 * LD, FABS, CMPLT, CMOVT, MTC, LDINT, NOP, SIZE and BASE_SHIFT come from
 * common.h and are not visible here.
 * NOTE(review): the comments assume CMPLT sets the condition flag when its
 * first source is less than its second, and that CMOVT writes its second
 * source when the flag is set and its first otherwise - confirm in common.h.
 */
/* Integer arguments. */
#define N $r4
#define X $r5
#define INCX $r6
/* Loop counter.  TEMP is defined but never referenced in this routine. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements kept in flight by the unrolled loop. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t4: absolute values currently being compared. */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: four independent running results; s1 is the one returned. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
/* With F_INTERFACE the two integer arguments are loaded through the
   addresses passed in N and INCX. */
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Default result for the early exits. */
MTC s1, $r0
/* N <= 0: nothing to scan. */
bge $r0, N, .L999
/* Scale the stride (BASE_SHIFT is presumably log2 of the element size). */
slli.d INCX, INCX, BASE_SHIFT
/* INCX <= 0 also takes the early exit. */
bge $r0, INCX, .L999
/* Seed the candidates with |x[0]| and count that element off. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
/* Only one element: s1 already holds the answer. */
bge $r0, N, .L999
FABS s3, a1
/* I = number of complete groups of eight among the remaining elements. */
srai.d I, N, 3
FABS s4, a1
/* Fewer than eight left: go straight to the scalar tail. */
bge $r0, I, .L15
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L13
.align 3
/* Steady state: compare the group already held in a1..a8 while loading the
   next group into the same registers.  Every aN is read by FABS before the
   LD that overwrites it, so the instruction order must be kept. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a3, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a4, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
/* Second half of the group (a5..a8). */
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3
/* Drain: compare the last preloaded group; no further loads are issued. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3
/* Tail: the low three bits of the (already decremented) N give the number
   of leftover elements; they are folded into s1 only, one per iteration.
   NOTE(review): the NOP has no visible effect on the result; presumably a
   scheduling/padding leftover - confirm before removing. */
.L15:
andi I, N, 7
NOP
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Fold the four candidates: (s1,s2) -> s1, (s3,s4) -> s3, (s1,s3) -> s1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
.align 3
/* Return s1 in $f0 and jump to the address in $r1.
   NOTE(review): $r4 is also overwritten with I ($r17), which has not been
   written on the early-exit paths; presumably the integer return register
   is ignored by callers - confirm. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

232
kernel/loongarch64/asum.S Normal file
View File

@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * asum kernel: accumulates the absolute values of N strided elements.
 *
 * Inputs:  N ($r4) element count, X ($r5) pointer to the first element,
 *          INCX ($r6) stride in elements (scaled to bytes below).
 * Result:  $f0 = s1 + s2, where s1 and s2 are two partial sums that are
 *          both started from $r0 (the zero register).
 *
 * Two code paths:
 *   .L12-.L16  stride equal to one element (scaled INCX == SIZE): loads use
 *              fixed offsets from X and X advances by 8 * SIZE per group.
 *   .L20-.L26  any other stride: X advances by INCX after every load.
 * Both process eight elements per iteration with the loads for the next
 * group interleaved with the additions for the current one, followed by a
 * scalar tail for the leftover N mod 8 elements.
 *
 * Only N <= 0 is tested on entry; INCX is not range-checked in this routine.
 *
 * LD, FABS, ADD, MTC, LDINT, NOP, SIZE and BASE_SHIFT come from common.h.
 */
/* Integer arguments. */
#define N $r4
#define X $r5
#define INCX $r6
/* Loop counter and scratch (TEMP holds SIZE for the stride comparison). */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements kept in flight by the unrolled loops. */
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define a5 $f12
#define a6 $f13
#define a7 $f14
#define a8 $f15
/* t1..t4: absolute values waiting to be added. */
#define t1 $f16
#define t2 $f17
#define t3 $f0
#define t4 $f1
/* s1, s2: the two partial sums; combined at .L999. */
#define s1 $f22
#define s2 $f8
PROLOGUE
/* With F_INTERFACE the two integer arguments are loaded through the
   addresses passed in N and INCX. */
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Start both partial sums from the zero register. */
MTC s1, $r0
MTC s2, $r0
/* Scale the stride (BASE_SHIFT is presumably log2 of the element size). */
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
/* N <= 0: return the zero sum. */
bge $r0, N, .L999
/* I = number of complete groups of eight. */
srai.d I, N, 3
/* Non-unit stride uses the general path. */
bne INCX, TEMP, .L20
bge $r0, I, .L15
/* Unit stride: preload the first group and start the first four FABS. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
FABS t1, a1
LD a6, X, 5 * SIZE
FABS t2, a2
LD a7, X, 6 * SIZE
FABS t3, a3
FABS t4, a4
addi.d I, I, -1
LD a8, X, 7 * SIZE
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L13
.align 3
/* Unit-stride steady state.  t1..t4 alternate between the first and second
   half of a group; each aN is consumed by FABS before the LD that refills
   it.  X is advanced in the middle of the body, which is why the first four
   loads use offsets 8..11 and the last four use 4..7.
   NOTE(review): the NOPs have no visible effect on the result; presumably
   scheduling/padding - confirm before removing. */
.L12:
ADD s1, s1, t1
LD a1, X, 8 * SIZE
FABS t1, a5
addi.d I, I, -1
ADD s2, s2, t2
LD a2, X, 9 * SIZE
FABS t2, a6
NOP
ADD s1, s1, t3
LD a3, X, 10 * SIZE
FABS t3, a7
NOP
ADD s2, s2, t4
LD a4, X, 11 * SIZE
FABS t4, a8
addi.d X, X, 8 * SIZE
ADD s1, s1, t1
LD a5, X, 4 * SIZE
FABS t1, a1
NOP
ADD s2, s2, t2
LD a6, X, 5 * SIZE
FABS t2, a2
NOP
ADD s1, s1, t3
LD a7, X, 6 * SIZE
FABS t3, a3
NOP
ADD s2, s2, t4
LD a8, X, 7 * SIZE
FABS t4, a4
blt $r0, I, .L12
.align 3
/* Drain the last preloaded group and step X past it. */
.L13:
ADD s1, s1, t1
addi.d X, X, 8 * SIZE
FABS t1, a5
NOP
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* Unit-stride tail: leftover N mod 8 elements, added into s1. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
ADD s1, s1, t1
addi.d X, X, SIZE
blt $r0, I, .L16
b .L999
.align 3
/* General stride: same scheme, with X stepped by INCX after each load. */
.L20:
bge $r0, I, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
LD a7, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a8, X, 0 * SIZE
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L24
.align 3
/* General-stride steady state; same read-before-reload ordering as .L12. */
.L23:
ADD s1, s1, t1
LD a1, X, 0 * SIZE
FABS t1, a5
add.d X, X, INCX
ADD s2, s2, t2
LD a2, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
ADD s1, s1, t3
LD a3, X, 0 * SIZE
FABS t3, a7
add.d X, X, INCX
ADD s2, s2, t4
LD a4, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
ADD s1, s1, t1
LD a5, X, 0 * SIZE
FABS t1, a1
add.d X, X, INCX
ADD s2, s2, t2
LD a6, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
ADD s1, s1, t3
LD a7, X, 0 * SIZE
FABS t3, a3
add.d X, X, INCX
ADD s2, s2, t4
LD a8, X, 0 * SIZE
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
.align 3
/* Drain the last preloaded group. */
.L24:
ADD s1, s1, t1
FABS t1, a5
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* General-stride tail: leftover N mod 8 elements, added into s1. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
add.d X, X, INCX
ADD s1, s1, t1
blt $r0, I, .L26
.align 3
/* Combine the two partial sums, return the total in $f0 and jump to the
   address in $r1.
   NOTE(review): $r4 is also overwritten with I ($r17), which has not been
   written when N <= 0; presumably the integer return register is ignored by
   callers - confirm. */
.L999:
ADD s1, s1, s2
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

159
kernel/loongarch64/cnrm2.S Normal file
View File

@ -0,0 +1,159 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * cnrm2 kernel: square root of the sum of squares over N strided elements,
 * where each element contributes two values (offsets 0 and 1 * SIZE;
 * presumably the real and imaginary parts of a complex number).
 *
 * Inputs:  N ($r4) element count, X ($r5) pointer to the first element,
 *          INCX ($r6) stride in elements (scaled with ZBASE_SHIFT below).
 * Result:  $f0, single precision.
 *
 * Every loaded value is widened with fcvt.d.s and squared/accumulated with
 * fmadd.d, so the sums s1 and s2 are kept in double precision; the final
 * value is sqrt(s1 + s2) narrowed back with fcvt.s.d.  No scaling pass is
 * performed in this routine.
 *
 * The main loop handles four elements (eight values) per iteration with the
 * loads for the next group interleaved with the arithmetic for the current
 * one; a scalar tail covers the leftover N mod 4 elements.
 *
 * LD, LDINT, SIZE and ZBASE_SHIFT come from common.h.
 */
/* Integer arguments. */
#define N $r4
#define X $r5
#define INCX $r6
/* Loop counter.  TEMP is loaded once below but never read afterwards. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight raw values kept in flight (four elements x two parts). */
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define a5 $f16
#define a6 $f17
#define a7 $f0
#define a8 $f1
/* s1, s2: double-precision partial sums of squares. */
#define s1 $f22
#define s2 $f8
/* t1..t4: values widened to double, waiting to be squared. */
#define t1 $f23
#define t2 $f9
#define t3 $f10
#define t4 $f11
PROLOGUE
/* With F_INTERFACE the two integer arguments are loaded through the
   addresses passed in N and INCX. */
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Zero both partial sums (s2 is copied from s1). */
movgr2fr.d s1, $r0
li.d TEMP, 2 * SIZE
fmov.d s2, s1
/* N <= 0: fall through to .L999 with zero sums. */
bge $r0, N, .L999
/* Scale the stride (ZBASE_SHIFT is presumably log2 of the two-value
   element size). */
slli.d INCX, INCX, ZBASE_SHIFT
/* INCX <= 0 also exits with zero sums. */
bge $r0, INCX, .L999
/* I = number of complete groups of four elements. */
srai.d I, N, 2
bge $r0, I, .L25
/* Preload the first group and start widening its first two elements. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
fcvt.d.s t1, a1
LD a7, X, 0 * SIZE
fcvt.d.s t2, a2
LD a8, X, 1 * SIZE
fcvt.d.s t3, a3
addi.d I, I, -1
fcvt.d.s t4, a4
add.d X, X, INCX
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L24
.align 3
/* Steady state: s += t*t for the values already widened, while reloading
   a1..a8 and widening the next values.  Each aN is consumed by fcvt.d.s
   before the LD that overwrites it, so the instruction order must be kept. */
.L23:
fmadd.d s1, t1, t1, s1
LD a1, X, 0 * SIZE
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
LD a2, X, 1 * SIZE
fcvt.d.s t2, a6
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a3, X, 0 * SIZE
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
LD a4, X, 1 * SIZE
fcvt.d.s t4, a8
add.d X, X, INCX
fmadd.d s1, t1, t1, s1
LD a5, X, 0 * SIZE
fcvt.d.s t1, a1
addi.d I, I, -1
fmadd.d s2, t2, t2, s2
LD a6, X, 1 * SIZE
fcvt.d.s t2, a2
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a7, X, 0 * SIZE
fcvt.d.s t3, a3
LD a8, X, 1 * SIZE
fmadd.d s2, t4, t4, s2
add.d X, X, INCX
fcvt.d.s t4, a4
blt $r0, I, .L23
.align 3
/* Drain the last preloaded group; no further loads are issued. */
.L24:
fmadd.d s1, t1, t1, s1
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
fcvt.d.s t2, a6
fmadd.d s1, t3, t3, s1
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
fcvt.d.s t4, a8
fmadd.d s1, t1, t1, s1
fmadd.d s2, t2, t2, s2
fmadd.d s1, t3, t3, s1
fmadd.d s2, t4, t4, s2
.align 3
/* Tail: leftover N mod 4 elements, one element (two values) per iteration. */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
fcvt.d.s t2, a2
fmadd.d s1, t1, t1, s1
add.d X, X, INCX
fmadd.d s2, t2, t2, s2
blt $r0, I, .L26
.align 3
/* Combine the partial sums, take the square root in double precision,
   narrow to single precision into $f0 and jump to the address in $r1.
   NOTE(review): $r4 is also overwritten with I ($r17), which has not been
   written on the early-exit paths; presumably the integer return register
   is ignored by callers - confirm. */
.L999:
fadd.d s1, s1, s2
fsqrt.d s1, s1
move $r4, $r17
fcvt.s.d $f0, s1
jirl $r0, $r1, 0x0
EPILOGUE

225
kernel/loongarch64/copy.S Normal file
View File

@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * copy kernel: copies N elements from the strided vector X to the strided
 * vector Y.
 *
 * Inputs:  N ($r4) element count, X ($r5) source pointer, INCX ($r6) source
 *          stride, Y ($r7) destination pointer, INCY ($r8) destination
 *          stride.  Both strides are in elements and are scaled to bytes
 *          below.
 *
 * Two code paths:
 *   .L12-.L16  both scaled strides equal SIZE: loads/stores use fixed
 *              offsets and both pointers advance by 8 * SIZE per group.
 *   .L20-.L26  any other stride combination: X and Y advance by INCX / INCY
 *              after every access.
 * Both move eight elements per iteration, storing the group already held
 * in a1..a8 while loading the next one, followed by a scalar tail for the
 * leftover N mod 8 elements.
 *
 * Only N <= 0 is tested on entry; the strides are not range-checked in this
 * routine.
 *
 * LD, ST, LDINT, NOP, SIZE and BASE_SHIFT come from common.h.
 */
/* Integer arguments. */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
/* Loop counter and scratch (TEMP holds SIZE for the stride comparisons). */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements in flight between load and store. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
PROLOGUE
/* With F_INTERFACE the three integer arguments are loaded through the
   addresses passed in N, INCX and INCY. */
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
li.d TEMP, SIZE
NOP
/* Scale the strides (BASE_SHIFT is presumably log2 of the element size). */
slli.d INCX, INCX, BASE_SHIFT
/* N <= 0: nothing to copy. */
bge $r0, N, .L999
slli.d INCY, INCY, BASE_SHIFT
/* Either stride different from one element selects the general path. */
bne INCX, TEMP, .L20
srai.d I, N, 3
bne INCY, TEMP, .L20
/* I = (number of complete groups of eight) - 1; negative means no group. */
addi.d I, I, -1
blt I, $r0, .L15
/* Unit stride: preload the first group. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
LD a6, X, 5 * SIZE
LD a7, X, 6 * SIZE
LD a8, X, 7 * SIZE
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L13
.align 3
/* Unit-stride steady state: store element k of the current group, then
   reload the same register with element k of the next group (offset k+8). */
.L12:
ST a1, Y, 0 * SIZE
LD a1, X, 8 * SIZE
ST a2, Y, 1 * SIZE
LD a2, X, 9 * SIZE
ST a3, Y, 2 * SIZE
LD a3, X, 10 * SIZE
ST a4, Y, 3 * SIZE
LD a4, X, 11 * SIZE
ST a5, Y, 4 * SIZE
LD a5, X, 12 * SIZE
ST a6, Y, 5 * SIZE
LD a6, X, 13 * SIZE
ST a7, Y, 6 * SIZE
LD a7, X, 14 * SIZE
ST a8, Y, 7 * SIZE
LD a8, X, 15 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: store the last preloaded group and step both pointers past it. */
.L13:
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
ST a3, Y, 2 * SIZE
ST a4, Y, 3 * SIZE
ST a5, Y, 4 * SIZE
ST a6, Y, 5 * SIZE
ST a7, Y, 6 * SIZE
ST a8, Y, 7 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
.align 3
/* Unit-stride tail: leftover N mod 8 elements.  Y is advanced before the
   store, hence the -1 * SIZE offset. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d X, X, SIZE
addi.d I, I, -1
addi.d Y, Y, SIZE
ST a1, Y, -1 * SIZE
blt $r0, I, .L16
b .L999
.align 3
/* General stride: recompute the group count (the unit-stride path may have
   branched here before or after computing it). */
.L20:
srai.d I, N, 3
addi.d I, I, -1
blt I, $r0, .L25
/* Preload the first group, stepping X by INCX after each load. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
add.d X, X, INCX
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L23
.align 3
/* General-stride steady state: store, step Y, reload, step X, per element. */
.L22:
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, X, 0 * SIZE
add.d X, X, INCX
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
LD a3, X, 0 * SIZE
add.d X, X, INCX
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST a5, Y, 0 * SIZE
add.d Y, Y, INCY
LD a5, X, 0 * SIZE
add.d X, X, INCX
ST a6, Y, 0 * SIZE
add.d Y, Y, INCY
LD a6, X, 0 * SIZE
add.d X, X, INCX
ST a7, Y, 0 * SIZE
add.d Y, Y, INCY
LD a7, X, 0 * SIZE
add.d X, X, INCX
ST a8, Y, 0 * SIZE
add.d Y, Y, INCY
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L22
.align 3
/* Drain: store the last preloaded group. */
.L23:
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
ST a5, Y, 0 * SIZE
add.d Y, Y, INCY
ST a6, Y, 0 * SIZE
add.d Y, Y, INCY
ST a7, Y, 0 * SIZE
add.d Y, Y, INCY
ST a8, Y, 0 * SIZE
add.d Y, Y, INCY
.align 3
/* General-stride tail: leftover N mod 8 elements. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
addi.d I, I, -1
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L26
.align 3
/* Return by jumping to the address in $r1.
   NOTE(review): $r4 and $f0 are overwritten with I ($r17) and a1 ($f22),
   neither of which is a meaningful result here; presumably callers ignore
   both return registers for this routine - confirm. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

314
kernel/loongarch64/dnrm2.S Normal file
View File

@ -0,0 +1,314 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * dnrm2 kernel: two-pass scaled computation of sqrt(sum of x[i]^2).
 *
 * Inputs:  N ($r4) element count, X ($r5) pointer to the first element,
 *          INCX ($r6) stride in elements (scaled to bytes below).
 * Result:  $f0.
 *
 * Pass 1 (.L12-.L100): scan the vector for the largest absolute value,
 *          using four running candidates folded into s1.  X is advanced;
 *          the original pointer is kept in XX for the second pass.
 * Pass 2 (.L103-.L998): with ALPHA = 1 / max, accumulate (ALPHA * x[i])^2
 *          into four partial sums, then return max * sqrt(sum).
 *
 * Early exits through .L999 return s1 unchanged: the value loaded from $r0
 * when N <= 0 or INCX <= 0, |x[0]| when N == 1, and the maximum itself when
 * it compares equal to zero (which avoids the division).
 *
 * LD, FABS, CMPLT, CMOVT, CMPEQ, MTC, MOV, MUL, MADD, ADD, LDINT, NOP, SIZE
 * and BASE_SHIFT come from common.h and are not visible here.
 * NOTE(review): the comments assume CMPLT sets the condition flag when its
 * first source is less than its second, that CMOVT writes its second source
 * when the flag is set and its first otherwise, and that MADD d, a, b, c
 * computes a * b + c - confirm in common.h.
 */
/* Integer arguments. */
#define N $r4
#define X $r5
#define INCX $r6
/* Saved start pointer for the second pass. */
#define XX $r7
/* Loop counter and scratch (TEMP carries the bit pattern of 1.0f). */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements kept in flight by the unrolled loops. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t4: per-element temporaries (|x| in pass 1, ALPHA * x in pass 2). */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: running maxima in pass 1, partial sums of squares in pass 2. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
/* Scale factor (1 / max) and the saved maximum. */
#define ALPHA $f4
#define max $f5
PROLOGUE
/* With F_INTERFACE the two integer arguments are loaded through the
   addresses passed in N and INCX. */
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Default result for the early exits. */
MTC s1, $r0
/* N <= 0: nothing to scan. */
bge $r0, N, .L999
/* Scale the stride (BASE_SHIFT is presumably log2 of the element size). */
slli.d INCX, INCX, BASE_SHIFT
/* INCX <= 0 also takes the early exit. */
bge $r0, INCX, .L999
/* Remember the start of the vector for pass 2. */
move XX, X
NOP
/* Pass 1: seed the candidates with |x[0]| and count that element off. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
/* Only one element: |x[0]| is returned directly. */
bge $r0, N, .L999
FABS s3, a1
/* I = number of complete groups of eight among the remaining elements. */
srai.d I, N, 3
FABS s4, a1
bge $r0, I, .L15
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L13
.align 3
/* Pass 1 steady state: compare the group held in a1..a8 while loading the
   next group.  Every aN is read by FABS before the LD that overwrites it,
   so the instruction order must be kept. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3
/* Pass 1 drain: compare the last preloaded group. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3
/* Pass 1 tail: leftover (N-1) mod 8 elements, folded into s1 only. */
.L15:
andi I, N, 7
bge $r0, I, .L100
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Fold the four candidates into s1, then set up pass 2. */
.L100:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
/* Undo the earlier decrement so pass 2 covers all N elements from XX. */
addi.d N, N, 1
/* 0x3f800 << 12 = 0x3f800000, the single-precision bit pattern of 1.0;
   it is moved to ALPHA and widened to double below. */
lu12i.w TEMP, 0x3f800
/* a1 = 0.0, used for the zero test and to clear the accumulators. */
movgr2fr.d a1, $r0
movgr2fr.w ALPHA, TEMP
CMPEQ $fcc0, s1, a1
fcvt.d.s ALPHA, ALPHA
/* Maximum equals zero: return it as is and skip the division. */
bcnez $fcc0, .L999
/* ALPHA = 1 / max; keep max for the final rescale and clear the sums. */
fdiv.d ALPHA, ALPHA, s1
MOV max, s1
MOV s1, a1
MOV s2, a1
MOV s3, a1
MOV s4, a1
/* I = number of complete groups of eight. */
srai.d I, N, 3
bge $r0, I, .L105
/* Pass 2: preload the first group from the saved start pointer. */
LD a1, XX, 0 * SIZE
add.d XX, XX, INCX
LD a2, XX, 0 * SIZE
add.d XX, XX, INCX
LD a3, XX, 0 * SIZE
add.d XX, XX, INCX
LD a4, XX, 0 * SIZE
add.d XX, XX, INCX
LD a5, XX, 0 * SIZE
add.d XX, XX, INCX
LD a6, XX, 0 * SIZE
add.d XX, XX, INCX
LD a7, XX, 0 * SIZE
add.d XX, XX, INCX
LD a8, XX, 0 * SIZE
addi.d I, I, -1
add.d XX, XX, INCX
/* Exactly one group: skip the steady-state loop and just drain it. */
bge $r0, I, .L104
.align 3
/* Pass 2 steady state: t = ALPHA * x, s += t * t, interleaved with the
   loads of the next group (each aN is read by MUL before it is reloaded). */
.L103:
MUL t1, ALPHA, a1
LD a1, XX, 0 * SIZE
MUL t2, ALPHA, a2
add.d XX, XX, INCX
MUL t3, ALPHA, a3
LD a2, XX, 0 * SIZE
MUL t4, ALPHA, a4
add.d XX, XX, INCX
MADD s1, t1, t1, s1
LD a3, XX, 0 * SIZE
MADD s2, t2, t2, s2
add.d XX, XX, INCX
MADD s3, t3, t3, s3
LD a4, XX, 0 * SIZE
MADD s4, t4, t4, s4
add.d XX, XX, INCX
MUL t1, ALPHA, a5
LD a5, XX, 0 * SIZE
MUL t2, ALPHA, a6
add.d XX, XX, INCX
MUL t3, ALPHA, a7
LD a6, XX, 0 * SIZE
MUL t4, ALPHA, a8
add.d XX, XX, INCX
MADD s1, t1, t1, s1
LD a7, XX, 0 * SIZE
MADD s2, t2, t2, s2
add.d XX, XX, INCX
MADD s3, t3, t3, s3
LD a8, XX, 0 * SIZE
MADD s4, t4, t4, s4
addi.d I, I, -1
add.d XX, XX, INCX
blt $r0, I, .L103
.align 3
/* Pass 2 drain: accumulate the last preloaded group. */
.L104:
MUL t1, ALPHA, a1
MUL t2, ALPHA, a2
MUL t3, ALPHA, a3
MUL t4, ALPHA, a4
MADD s1, t1, t1, s1
MADD s2, t2, t2, s2
MADD s3, t3, t3, s3
MADD s4, t4, t4, s4
MUL t1, ALPHA, a5
MUL t2, ALPHA, a6
MUL t3, ALPHA, a7
MUL t4, ALPHA, a8
MADD s1, t1, t1, s1
MADD s2, t2, t2, s2
MADD s3, t3, t3, s3
MADD s4, t4, t4, s4
.align 3
/* Pass 2 tail: leftover N mod 8 elements, accumulated into s1. */
.L105:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L106:
LD a1, XX, 0 * SIZE
addi.d I, I, -1
MUL t1, ALPHA, a1
add.d XX, XX, INCX
MADD s1, t1, t1, s1
blt $r0, I, .L106
.align 3
/* Combine the four partial sums, take the square root, scale back by max
   into $f0 and jump to the address in $r1. */
.L998:
ADD s1, s1, s2
ADD s3, s3, s4
ADD s1, s1, s3
fsqrt.d s1, s1
move $r4, $r17
MUL $f0, max, s1
jirl $r0, $r1, 0x0
.align 3
/* Early-exit return: s1 is passed through unchanged in $f0.
   NOTE(review): $r4 is also overwritten with I ($r17), which has not been
   written on some of these paths; presumably the integer return register is
   ignored by callers - confirm. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

391
kernel/loongarch64/dot.S Normal file
View File

@ -0,0 +1,391 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Integer arguments: element count, then the base pointer and stride of
   each vector.  INCX and INCY are shifted left by BASE_SHIFT in the
   prologue before they are added to the pointers. */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
/* Scratch integer registers: loop counter and a general temporary. */
#define I $r17
#define TEMP $r18
/* Operands loaded from X (a1..a4) and from Y (b1..b4). */
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define b1 $f12
#define b2 $f13
#define b3 $f14
#define b4 $f15
/* Two independent partial sums; they are added together at .L999. */
#define s1 $f22
#define s2 $f8
/* Dot product of X and Y: sets up the accumulators and strides, then
   dispatches to the unit-stride path (falls through) or the general
   strided path (.L20). */
PROLOGUE
#ifdef F_INTERFACE
/* Fortran-style entry: scalar arguments are passed by reference. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* Clear both partial sums. */
#ifdef DSDOT
/* In the DSDOT build the sums are consumed by fmadd.d / fadd.d, so they
   must hold a full 64-bit zero.  MTC presumably expands to a 32-bit move
   in this single-precision build (TODO confirm in the common header),
   which would leave the upper half of the register unspecified; use the
   64-bit move explicitly. */
movgr2fr.d s1, $r0
movgr2fr.d s2, $r0
#else
MTC s1, $r0
MTC s2, $r0
#endif
/* Scale the strides by the element size and keep SIZE in TEMP so the
   strides can be compared against "contiguous". */
slli.d INCX, INCX, BASE_SHIFT
li.d TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
/* N <= 0: return the (zero) sum immediately. */
bge $r0, N, .L999
/* I = number of 8-element groups. */
srai.d I, N, 3
/* Any non-unit stride takes the general path. */
bne INCX, TEMP, .L20
bne INCY, TEMP, .L20
/* Unit-stride path.  Skip the unrolled code when there is no full group
   of 8 elements. */
bge $r0, I, .L15
/* Preload elements 0..3 of both vectors so loads run ahead of the
   multiply-adds inside the loop. */
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
LD a2, X, 1 * SIZE
LD b2, Y, 1 * SIZE
LD a3, X, 2 * SIZE
LD b3, Y, 2 * SIZE
LD a4, X, 3 * SIZE
addi.d I, I, -1
LD b4, Y, 3 * SIZE
/* Only one group: go straight to the drain code. */
bge $r0, I, .L13
.align 3
/* Main loop: consumes 8 element pairs per iteration.  Each pair is
   accumulated and its registers are immediately reloaded with data four
   elements ahead; even pairs go to s1, odd pairs to s2.  With DSDOT the
   operands are widened to double before the multiply-add. */
.L12:
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 4 * SIZE
LD b1, Y, 4 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 5 * SIZE
LD b2, Y, 5 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 6 * SIZE
LD b3, Y, 6 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 7 * SIZE
LD b4, Y, 7 * SIZE
/* Second half of the group (elements 4..7); reloads fetch the first
   four elements of the next group. */
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 8 * SIZE
LD b1, Y, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 9 * SIZE
LD b2, Y, 9 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 10 * SIZE
LD b3, Y, 10 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 11 * SIZE
LD b4, Y, 11 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: finishes the last full group.  Same pattern as the loop body
   but without loading anything beyond element 7 of the group. */
.L13:
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 4 * SIZE
LD b1, Y, 4 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 5 * SIZE
LD b2, Y, 5 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 6 * SIZE
LD b3, Y, 6 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 7 * SIZE
LD b4, Y, 7 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
/* Pointer updates are interleaved with the last multiply-adds; X and Y
   end up just past the group for the remainder loop. */
addi.d X, X, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
addi.d Y, Y, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
.align 3
/* Unit-stride remainder: the N mod 8 elements left after the unrolled
   groups, one element pair per iteration, all accumulated into s1. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L16
/* Done with the unit-stride path; skip over the strided code. */
b .L999
.align 3
/* General (non-unit stride) path entry.
   Only when built with F_INTERFACE: for a negative stride, move the base
   pointer back by (N - 1) * stride so that walking with the negative
   stride starts at the far end of the vector.  INCX / INCY are already
   scaled by the element size at this point.
   The product uses the LoongArch mul.d / sub.d instructions; the
   previous mult / mflo / dsub sequence was MIPS syntax and does not
   assemble for this target. */
.L20:
#ifdef F_INTERFACE
bgez INCX, .L21
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCX
sub.d X, X, TEMP
.align 3
.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCY
sub.d Y, Y, TEMP
.align 3
.L22:
#endif
/* Strided path, 8 element pairs per iteration.  I still holds N >> 3
   from the prologue.  Every pair reuses a1/b1: load, advance both
   pointers by their strides, then accumulate, alternating between the
   s1 and s2 partial sums. */
bge $r0, I, .L25
.align 3
.L23:
/* pair 0 -> s1 */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
/* pair 1 -> s2 */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
/* pair 2 -> s1 */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
/* pair 3 -> s2 */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
/* pair 4 -> s1 */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
/* pair 5 -> s2 */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
/* pair 6 -> s1 */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
/* pair 7 -> s2; the group counter is decremented before the final
   accumulate. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3
/* Strided remainder: the N mod 8 leftover element pairs, one per
   iteration, accumulated into s1. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3
/* Common exit: combine the two partial sums into $f0 and return.
   DSDOT accumulates in double, so the final add is forced to fadd.d. */
.L999:
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
/* Copies the loop counter register (I) into $r4.
   NOTE(review): the result of this routine is the sum in $f0; this
   integer move looks vestigial - confirm no caller reads $r4. */
move $r4, $r17
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large Load Diff

531
kernel/loongarch64/gemv_n.S Normal file
View File

@ -0,0 +1,531 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register map for the non-transposed matrix-vector kernel.
   The third integer argument (dummy1, arriving in $r6) is unused; its
   register is reused for INCY, which is loaded from the stack in the
   prologue together with BUFFER. */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
/* YORIG: the contiguous vector the kernel actually updates (Y itself, or
   BUFFER when Y is strided). */
#define YORIG $r18
/* XX / YY: walking pointers; I / J: row and column loop counters. */
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
/* AO1 / AO2: pointers into the two matrix columns being processed. */
#define AO1 $r23
#define AO2 $r24
#define ALPHA $f0
/* a1..a4 hold elements of column AO1 and a5..a8 elements of column AO2
   in the two-column loop; a1..a4 are also used as copy temporaries. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/* x1 / x2: ALPHA times the X element of the current column(s). */
#define x1 $f14
#define x2 $f15
/* y1..y8: elements of the result vector being updated. */
#define y1 $f16
#define y2 $f17
#define y3 $f3
#define y4 $f1
#define y5 $f2
#define y6 $f4
#define y7 $f5
#define y8 $f6
/* t1..t4: updated result values waiting to be stored. */
#define t1 $f7
#define t2 $f18
#define t3 $f19
#define t4 $f20
/* Entry: fetch the stack-passed arguments, save the registers used for
   AO1/AO2, scale the strides, and decide whether Y can be updated in
   place. */
PROLOGUE
/* INCY and BUFFER are read from the caller's stack before $sp moves. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
#ifdef __64BIT__
addi.d $sp, $sp, -16
#else
addi.d $sp, $sp, -48
#endif
/* $r23 / $r24 (AO1 / AO2) are preserved across the call. */
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
slli.d LDA, LDA, BASE_SHIFT
#ifndef __64BIT__
fst.d $f18, $sp, 16
fst.d $f19, $sp, 24
fst.d $f20, $sp, 32
#endif
slli.d INCX, INCX, BASE_SHIFT
/* Nothing to do for an empty matrix. */
bge $r0, M, .L999
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
/* Contiguous Y (stride == SIZE): work directly on Y. */
li.d I, SIZE
move YORIG, Y
beq INCY, I, .L10
/* Strided Y: gather its M elements into the contiguous BUFFER and make
   BUFFER the working vector (YORIG).  Four elements per iteration, then
   the M mod 4 leftovers one at a time. */
srai.d I, M, 2
move YORIG, BUFFER
move XX, Y
move YY, BUFFER
bge $r0, I, .L05
.align 3
.L02:
LD a1, XX, 0 * SIZE
add.d XX, XX, INCY
LD a2, XX, 0 * SIZE
add.d XX, XX, INCY
LD a3, XX, 0 * SIZE
add.d XX, XX, INCY
LD a4, XX, 0 * SIZE
add.d XX, XX, INCY
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
ST a3, YY, 2 * SIZE
ST a4, YY, 3 * SIZE
addi.d I, I, -1
addi.d YY, YY, 4 * SIZE
blt $r0, I, .L02
.align 3
.L05:
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, XX, 0 * SIZE
add.d XX, XX, INCY
ST a1, YY, 0 * SIZE
addi.d I, I, -1
addi.d YY, YY, 1 * SIZE
blt $r0, I, .L06
.align 3
/* Column loop, two columns of A per pass (J = N >> 1). */
.L10:
srai.d J, N, 1
bge $r0, J, .L20
.align 3
/* Per pass: x1 = ALPHA * X[j], x2 = ALPHA * X[j+1]; AO1 / AO2 point at
   the two columns and A advances by two columns.  The working vector is
   then updated as y[i] += AO1[i] * x1 + AO2[i] * x2. */
.L11:
LD x1, X, 0 * SIZE
add.d X, X, INCX
LD x2, X, 0 * SIZE
add.d X, X, INCX
move AO1, A
add.d AO2, A, LDA
add.d A, AO2, LDA
move YY, YORIG
MUL x1, ALPHA, x1
/* I = number of 8-row groups. */
srai.d I, M, 3
MUL x2, ALPHA, x2
bge $r0, I, .L15
/* Preload rows 0..3 of both columns and rows 0..7 of y. */
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD a5, AO2, 0 * SIZE
LD y5, YY, 4 * SIZE
LD a6, AO2, 1 * SIZE
LD y6, YY, 5 * SIZE
LD a7, AO2, 2 * SIZE
LD y7, YY, 6 * SIZE
LD a8, AO2, 3 * SIZE
addi.d I, I, -1
LD y8, YY, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Main loop: 8 rows per iteration in two halves of 4.  Each operand
   register is reloaded right after its last use, so loads for rows 4..7
   and for the next group overlap the multiply-adds. */
.L12:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
LD y1, YY, 8 * SIZE
LD y2, YY, 9 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
LD y3, YY, 10 * SIZE
LD y4, YY, 11 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 4 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 5 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 6 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 7 * SIZE
/* Rows 0..3 complete. */
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD t1, a1, x1, y5
LD a1, AO1, 8 * SIZE
MADD t2, a2, x1, y6
LD a2, AO1, 9 * SIZE
LD y5, YY, 12 * SIZE
LD y6, YY, 13 * SIZE
MADD t3, a3, x1, y7
LD a3, AO1, 10 * SIZE
MADD t4, a4, x1, y8
LD a4, AO1, 11 * SIZE
LD y7, YY, 14 * SIZE
LD y8, YY, 15 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 8 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 9 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 10 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 11 * SIZE
/* Rows 4..7 complete. */
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: last full group of 8 rows; nothing beyond row 7 of the group
   is loaded. */
.L13:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 4 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 5 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 6 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 7 * SIZE
ST t1, YY, 0 * SIZE
MADD t1, a1, x1, y5
ST t2, YY, 1 * SIZE
MADD t2, a2, x1, y6
ST t3, YY, 2 * SIZE
MADD t3, a3, x1, y7
ST t4, YY, 3 * SIZE
MADD t4, a4, x1, y8
MADD t1, a5, x2, t1
addi.d AO1, AO1, 8 * SIZE
MADD t2, a6, x2, t2
addi.d AO2, AO2, 8 * SIZE
MADD t3, a7, x2, t3
addi.d YY, YY, 8 * SIZE
MADD t4, a8, x2, t4
/* YY has already been advanced, hence the negative offsets. */
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* Row tails for the two-column pass: the bits of M select a block of 4,
   then 2, then 1 remaining rows. */
.L15:
andi I, M, 4
bge $r0, I, .L16
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD a5, AO2, 0 * SIZE
MADD y1, a1, x1, y1
LD a6, AO2, 1 * SIZE
MADD y2, a2, x1, y2
LD a7, AO2, 2 * SIZE
MADD y3, a3, x1, y3
LD a8, AO2, 3 * SIZE
MADD y4, a4, x1, y4
MADD y1, a5, x2, y1
addi.d YY, YY, 4 * SIZE
MADD y2, a6, x2, y2
addi.d AO1, AO1, 4 * SIZE
MADD y3, a7, x2, y3
addi.d AO2, AO2, 4 * SIZE
MADD y4, a8, x2, y4
/* Pointers were advanced above, so stores use negative offsets. */
ST y1, YY, -4 * SIZE
ST y2, YY, -3 * SIZE
ST y3, YY, -2 * SIZE
ST y4, YY, -1 * SIZE
.align 3
/* Two remaining rows. */
.L16:
andi I, M, 2
bge $r0, I, .L17
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a5, AO2, 0 * SIZE
LD a6, AO2, 1 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
addi.d YY, YY, 2 * SIZE
MADD y1, a5, x2, y1
addi.d AO1, AO1, 2 * SIZE
MADD y2, a6, x2, y2
addi.d AO2, AO2, 2 * SIZE
ST y1, YY, -2 * SIZE
ST y2, YY, -1 * SIZE
.align 3
/* One remaining row. */
.L17:
andi I, M, 1
bge $r0, I, .L19
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD a5, AO2, 0 * SIZE
MADD y1, a1, x1, y1
MADD y1, a5, x2, y1
ST y1, YY, 0 * SIZE
.align 3
/* Next pair of columns. */
.L19:
addi.d J, J, -1
blt $r0, J, .L11
.align 3
/* Odd N: one last column remains.  Same structure as the two-column
   pass but with a single column pointer: y[i] += AO1[i] * x1. */
.L20:
andi J, N, 1
bge $r0, J, .L900
.align 3
.L21:
LD x1, X, 0 * SIZE
add.d X, X, INCX
move YY, YORIG
move AO1, A
srai.d I, M, 3
MUL x1, ALPHA, x1
bge $r0, I, .L25
/* Preload rows 0..3 of the column and rows 0..7 of y. */
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD y5, YY, 4 * SIZE
LD y6, YY, 5 * SIZE
LD y7, YY, 6 * SIZE
addi.d I, I, -1
LD y8, YY, 7 * SIZE
bge $r0, I, .L23
.align 3
/* Main loop: 8 rows per iteration, loads running one half-group ahead
   of the multiply-adds. */
.L22:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
LD y1, YY, 8 * SIZE
LD y2, YY, 9 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
LD y3, YY, 10 * SIZE
LD y4, YY, 11 * SIZE
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD t1, a1, x1, y5
LD a1, AO1, 8 * SIZE
MADD t2, a2, x1, y6
LD a2, AO1, 9 * SIZE
LD y5, YY, 12 * SIZE
LD y6, YY, 13 * SIZE
MADD t3, a3, x1, y7
LD a3, AO1, 10 * SIZE
MADD t4, a4, x1, y8
LD a4, AO1, 11 * SIZE
LD y7, YY, 14 * SIZE
LD y8, YY, 15 * SIZE
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Drain: last full group of 8 rows. */
.L23:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
ST t1, YY, 0 * SIZE
MADD t1, a1, x1, y5
ST t2, YY, 1 * SIZE
MADD t2, a2, x1, y6
ST t3, YY, 2 * SIZE
MADD t3, a3, x1, y7
ST t4, YY, 3 * SIZE
MADD t4, a4, x1, y8
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d YY, YY, 8 * SIZE
.align 3
/* Row tails for the single-column pass: 4, then 2, then 1 remaining
   rows, selected by the low bits of M. */
.L25:
andi I, M, 4
bge $r0, I, .L26
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
MADD y3, a3, x1, y3
addi.d YY, YY, 4 * SIZE
MADD y4, a4, x1, y4
addi.d AO1, AO1, 4 * SIZE
/* YY was advanced above, so stores use negative offsets. */
ST y1, YY, -4 * SIZE
ST y2, YY, -3 * SIZE
ST y3, YY, -2 * SIZE
ST y4, YY, -1 * SIZE
.align 3
/* Two remaining rows. */
.L26:
andi I, M, 2
bge $r0, I, .L27
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
MADD y1, a1, x1, y1
addi.d YY, YY, 2 * SIZE
MADD y2, a2, x1, y2
addi.d AO1, AO1, 2 * SIZE
ST y1, YY, -2 * SIZE
ST y2, YY, -1 * SIZE
.align 3
/* One remaining row. */
.L27:
andi I, M, 1
bge $r0, I, .L900
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
MADD y1, a1, x1, y1
ST y1, YY, 0 * SIZE
.align 3
/* Write-back: when Y is strided the result was accumulated in BUFFER, so
   scatter it back to Y with stride INCY (4 elements per iteration, then
   the M mod 4 leftovers).  With contiguous Y nothing needs copying.
   YORIG is reused here as a scratch register holding SIZE. */
.L900:
li.d YORIG, SIZE
srai.d I, M, 2
beq INCY, YORIG, .L999
move XX, BUFFER
bge $r0, I, .L905
.align 3
.L902:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
LD a3, XX, 2 * SIZE
LD a4, XX, 3 * SIZE
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
addi.d XX, XX, 4 * SIZE
blt $r0, I, .L902
.align 3
.L905:
andi I, M, 3
bge $r0, I, .L999
.align 3
.L906:
LD a1, XX, 0 * SIZE
addi.d XX, XX, 1 * SIZE
ST a1, Y, 0 * SIZE
addi.d I, I, -1
add.d Y, Y, INCY
blt $r0, I, .L906
.align 3
/* Common exit: restore the saved registers, release the stack frame and
   return.  Also reached directly from the prologue when M <= 0 or
   N <= 0, after the registers have been saved. */
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
#ifndef __64BIT__
fld.d $f18, $sp, 16
fld.d $f19, $sp, 24
fld.d $f20, $sp, 32
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 16
#else
addi.d $sp, $sp, 48
#endif
/* Return 0 in $r4.  The previous code copied $r17 here, but $r17 is
   never written in this routine, so the returned value was whatever the
   caller had left in that temporary register. */
move $r4, $r0
/* NOTE(review): copies a1 into $f0; looks vestigial for a routine whose
   result is written to memory - kept unchanged, confirm before removing. */
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

436
kernel/loongarch64/gemv_t.S Normal file
View File

@ -0,0 +1,436 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Register map for the transposed matrix-vector kernel.
   The third integer argument (dummy1, arriving in $r6) is unused; its
   register is reused for INCY, which is loaded from the stack in the
   prologue together with BUFFER. */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
/* XORIG: the contiguous copy of X the kernel reads (X itself, or BUFFER
   when X is strided). */
#define XORIG $r18
/* XX: walking pointer into XORIG; YY: store pointer into Y. */
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
/* AO1 / AO2: pointers into the two matrix columns being processed. */
#define AO1 $r23
#define AO2 $r24
#define ALPHA $f0
/* a1..a8: matrix elements (odd-numbered from AO1, even-numbered from AO2
   in the two-column loop); a1..a4 are also used as copy temporaries. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/* y1..y4: partial dot-product accumulators. */
#define y1 $f14
#define y2 $f15
#define y3 $f16
#define y4 $f17
/* x1..x8: elements of the (contiguous) X vector. */
#define x1 $f3
#define x2 $f1
#define x3 $f2
#define x4 $f4
#define x5 $f5
#define x6 $f6
#define x7 $f7
#define x8 $f18
/* Entry: fetch the stack-passed arguments, save the registers used for
   AO1/AO2, scale the strides, and decide whether X can be read in
   place. */
PROLOGUE
/* INCY and BUFFER are read from the caller's stack before $sp moves. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
#ifdef __64BIT__
addi.d $sp, $sp, -16
#else
addi.d $sp, $sp, -32
#endif
/* y1 starts at zero; it is the seed for all accumulators. */
MTC y1, $r0
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
slli.d LDA, LDA, BASE_SHIFT
#ifndef __64BIT__
fst.d $f18, $sp, 16
#endif
slli.d INCX, INCX, BASE_SHIFT
/* Nothing to do for an empty matrix. */
bge $r0, M, .L999
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
/* Contiguous X (stride == SIZE): read X directly. */
li.d I, SIZE
move XORIG, X
beq INCX, I, .L10
/* Strided X: gather its M elements into the contiguous BUFFER and read
   from there (XORIG = BUFFER).  Four elements per iteration, then the
   M mod 4 leftovers.  YY is used as the destination pointer here. */
srai.d I, M, 2
move XORIG, BUFFER
move YY, BUFFER
bge $r0, I, .L05
.align 3
.L02:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
ST a3, YY, 2 * SIZE
ST a4, YY, 3 * SIZE
addi.d I, I, -1
addi.d YY, YY, 4 * SIZE
blt $r0, I, .L02
.align 3
.L05:
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, YY, 0 * SIZE
addi.d I, I, -1
addi.d YY, YY, 1 * SIZE
blt $r0, I, .L06
.align 3
/* Column loop, two columns of A per pass (J = N >> 1).  YY becomes the
   store pointer into Y; Y itself is used as the load pointer. */
.L10:
srai.d J, N, 1
move YY, Y
bge $r0, J, .L20
.align 3
/* Per pass: AO1 / AO2 point at two adjacent columns and four
   accumulators are cleared from y1 (which is zero here).  y1 and y3
   collect the AO1 dot product, y2 and y4 the AO2 dot product. */
.L11:
move AO1, A
MOV y2, y1
add.d AO2, A, LDA
MOV y3, y1
add.d A, AO2, LDA
MOV y4, y1
/* I = number of 8-row groups. */
srai.d I, M, 3
move XX, XORIG
bge $r0, I, .L15
/* Preload rows 0..3 of both columns and x[0..7]. */
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO2, 0 * SIZE
LD x2, XX, 1 * SIZE
LD a3, AO1, 1 * SIZE
LD x3, XX, 2 * SIZE
LD a4, AO2, 1 * SIZE
LD x4, XX, 3 * SIZE
LD a5, AO1, 2 * SIZE
LD x5, XX, 4 * SIZE
LD a6, AO2, 2 * SIZE
LD x6, XX, 5 * SIZE
LD a7, AO1, 3 * SIZE
LD x7, XX, 6 * SIZE
LD a8, AO2, 3 * SIZE
addi.d I, I, -1
LD x8, XX, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Main loop: 8 rows per iteration.  Each matrix register is reloaded
   with the element four rows ahead right after it is consumed. */
.L12:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y2, a2, x1, y2
LD a2, AO2, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y4, a4, x2, y4
LD a4, AO2, 5 * SIZE
LD x1, XX, 8 * SIZE
LD x2, XX, 9 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y2, a6, x3, y2
LD a6, AO2, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y4, a8, x4, y4
LD a8, AO2, 7 * SIZE
LD x3, XX, 10 * SIZE
LD x4, XX, 11 * SIZE
MADD y1, a1, x5, y1
LD a1, AO1, 8 * SIZE
MADD y2, a2, x5, y2
LD a2, AO2, 8 * SIZE
MADD y3, a3, x6, y3
LD a3, AO1, 9 * SIZE
MADD y4, a4, x6, y4
LD a4, AO2, 9 * SIZE
LD x5, XX, 12 * SIZE
LD x6, XX, 13 * SIZE
MADD y1, a5, x7, y1
LD a5, AO1, 10 * SIZE
MADD y2, a6, x7, y2
LD a6, AO2, 10 * SIZE
MADD y3, a7, x8, y3
LD a7, AO1, 11 * SIZE
MADD y4, a8, x8, y4
LD a8, AO2, 11 * SIZE
LD x7, XX, 14 * SIZE
LD x8, XX, 15 * SIZE
addi.d I, I, -1
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: last full group of 8 rows; nothing beyond row 7 of the group
   is loaded. */
.L13:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y2, a2, x1, y2
LD a2, AO2, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y4, a4, x2, y4
LD a4, AO2, 5 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y2, a6, x3, y2
LD a6, AO2, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y4, a8, x4, y4
LD a8, AO2, 7 * SIZE
MADD y1, a1, x5, y1
MADD y2, a2, x5, y2
MADD y3, a3, x6, y3
MADD y4, a4, x6, y4
MADD y1, a5, x7, y1
addi.d XX, XX, 8 * SIZE
MADD y2, a6, x7, y2
addi.d AO1, AO1, 8 * SIZE
MADD y3, a7, x8, y3
addi.d AO2, AO2, 8 * SIZE
MADD y4, a8, x8, y4
.align 3
/* Row tails for the two-column pass: an optional block of 4 rows ... */
.L15:
andi I, M, 4
bge $r0, I, .L17
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO2, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a4, AO2, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
MADD y1, a1, x1, y1
LD a6, AO2, 2 * SIZE
MADD y2, a2, x1, y2
LD a7, AO1, 3 * SIZE
MADD y3, a3, x2, y3
LD x4, XX, 3 * SIZE
MADD y4, a4, x2, y4
LD a8, AO2, 3 * SIZE
MADD y1, a5, x3, y1
MADD y2, a6, x3, y2
addi.d XX, XX, 4 * SIZE
MADD y3, a7, x4, y3
addi.d AO1, AO1, 4 * SIZE
MADD y4, a8, x4, y4
addi.d AO2, AO2, 4 * SIZE
.align 3
/* ... then fold the paired accumulators (y1 += y3, y2 += y4) and handle
   the last M mod 4 rows one at a time. */
.L17:
andi I, M, 3
ADD y1, y1, y3
ADD y2, y2, y4
bge $r0, I, .L19
.align 3
.L18:
LD x1, XX, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD a2, AO2, 0 * SIZE
addi.d I, I, -1
addi.d XX, XX, 1 * SIZE
addi.d AO1, AO1, 1 * SIZE
addi.d AO2, AO2, 1 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
blt $r0, I, .L18
.align 3
/* Update two elements of Y: y[j] += ALPHA * y1, y[j+1] += ALPHA * y2.
   Loads go through Y and stores through YY; both advance by INCY.
   y1 is reset to zero for the next pass. */
.L19:
LD a1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, Y, 0 * SIZE
add.d Y, Y, INCY
MADD a1, y1, ALPHA, a1
addi.d J, J, -1
MADD a2, y2, ALPHA, a2
MTC y1, $r0
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
ST a2, YY, 0 * SIZE
add.d YY, YY, INCY
blt $r0, J, .L11
.align 3
/* Odd N: one last column.  Two accumulators (y1, y3) collect alternate
   rows of a single dot product; y1 is zero on entry, so y3 is cleared by
   copying it. */
.L20:
andi J, N, 1
MOV y3, y1
move AO1, A
bge $r0, J, .L999
srai.d I, M, 3
move XX, XORIG
bge $r0, I, .L25
/* Preload rows 0..3 of the column and x[0..7].  Only the odd-numbered
   matrix registers are used in this pass. */
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
LD a7, AO1, 3 * SIZE
LD x4, XX, 3 * SIZE
LD x5, XX, 4 * SIZE
LD x6, XX, 5 * SIZE
LD x7, XX, 6 * SIZE
addi.d I, I, -1
LD x8, XX, 7 * SIZE
bge $r0, I, .L23
.align 3
/* Main loop: 8 rows per iteration. */
.L22:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
LD x1, XX, 8 * SIZE
LD x2, XX, 9 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
LD x3, XX, 10 * SIZE
LD x4, XX, 11 * SIZE
MADD y1, a1, x5, y1
LD a1, AO1, 8 * SIZE
MADD y3, a3, x6, y3
LD a3, AO1, 9 * SIZE
LD x5, XX, 12 * SIZE
LD x6, XX, 13 * SIZE
MADD y1, a5, x7, y1
LD a5, AO1, 10 * SIZE
MADD y3, a7, x8, y3
LD a7, AO1, 11 * SIZE
LD x7, XX, 14 * SIZE
LD x8, XX, 15 * SIZE
addi.d I, I, -1
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Drain: last full group of 8 rows. */
.L23:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y1, a1, x5, y1
MADD y3, a3, x6, y3
MADD y1, a5, x7, y1
MADD y3, a7, x8, y3
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
.align 3
/* Row tails for the single-column pass: an optional block of 4 rows ... */
.L25:
andi I, M, 4
bge $r0, I, .L27
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
MADD y1, a1, x1, y1
LD a7, AO1, 3 * SIZE
MADD y3, a3, x2, y3
LD x4, XX, 3 * SIZE
MADD y1, a5, x3, y1
addi.d XX, XX, 4 * SIZE
MADD y3, a7, x4, y3
addi.d AO1, AO1, 4 * SIZE
.align 3
/* ... then fold the two accumulators and handle the last M mod 4 rows
   one at a time. */
.L27:
andi I, M, 3
ADD y1, y1, y3
bge $r0, I, .L29
.align 3
.L28:
LD x1, XX, 0 * SIZE
LD a1, AO1, 0 * SIZE
addi.d I, I, -1
addi.d XX, XX, 1 * SIZE
addi.d AO1, AO1, 1 * SIZE
MADD y1, a1, x1, y1
blt $r0, I, .L28
.align 3
/* Update the last element of Y: y[j] += ALPHA * y1. */
.L29:
LD a1, Y, 0 * SIZE
add.d Y, Y, INCY
MADD a1, y1, ALPHA, a1
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
.align 3
/* Common exit: restore the saved registers, release the stack frame and
   return.  Also reached directly from the prologue when M <= 0 or
   N <= 0, after the registers have been saved. */
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
#ifndef __64BIT__
fld.d $f18, $sp, 16
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 16
#else
addi.d $sp, $sp, 32
#endif
/* Return 0 in $r4.  The previous code copied $r17 here, but $r17 is
   never written in this routine, so the returned value was whatever the
   caller had left in that temporary register. */
move $r4, $r0
/* NOTE(review): copies a1 into $f0; looks vestigial for a routine whose
   result is written to memory - kept unchanged, confirm before removing. */
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

233
kernel/loongarch64/iamax.S Normal file
View File

@ -0,0 +1,233 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Arguments: element count, vector base pointer and stride. */
#define N $r4
#define X $r5
#define INCX $r6
/* I: loop counter.  TEMP: 1-based position of the element currently
   being compared (used as the candidate index). */
#define I $r18
#define TEMP $r7
/* a1..a8: elements loaded from X. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t4: absolute values of the elements under comparison. */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: running maximum absolute value of each of the four lanes. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
/* x1..x4: index recorded for each lane maximum; x1 is copied to $r4 as
   the result at .L999. */
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
/* Finds the 1-based index of the element of X with the largest absolute
   value, scanning with four independent lanes.  Returns 0 when N <= 0
   or INCX <= 0, and 1 when N == 1. */
PROLOGUE
#ifdef F_INTERFACE
/* Fortran-style entry: scalar arguments are passed by reference. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Default result 0 for the degenerate cases below. */
li.d x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* The first element seeds the search; N now counts the remaining
   elements. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
li.d x1, 1
bge $r0, N, .L999
/* All four lanes start from |X[0]| with index 1.  TEMP = 2 is the index
   of the next element to examine. */
FABS s1, a1
add.d X, X, INCX
FABS s2, a1
li.d x2, 1
FABS s3, a1
/* I = number of 8-element groups among the remaining elements. */
srai.d I, N, 3
FABS s4, a1
li.d x3, 1
li.d TEMP, 2
li.d x4, 1
bge $r0, I, .L15
/* Preload the first group of 8 elements. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: 8 elements per iteration in two halves of 4, one element
   per lane.  Each lane compares its running maximum with the new
   absolute value and, when the new one is larger, takes both the value
   and TEMP as its index (CMOVT / MOVT are presumably conditional moves
   keyed on the condition flag - defined outside this file).
   All four lanes record the same TEMP, i.e. the position of the first
   element of the half-group; the per-lane offsets are added to
   x2/x3/x4 once, after the drain code at .L13.
   Loads for the next group are interleaved with the comparisons. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
/* Advance the position by the four elements just examined. */
addi.d TEMP, TEMP, 4
addi.d I, I, -1
/* Second half of the group: elements 5..8. */
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Drain: compare the last preloaded group of 8 elements without loading
   any further data. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
FABS t1, a5
addi.d TEMP, TEMP, 4
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
/* Lanes 2..4 recorded the position of lane 1's element; add each lane's
   offset within the half-group to obtain its own index.  A lane that
   never replaced its initial maximum ends up with an index that does
   not match its value, but its value is then |X[0]|, which cannot win
   the strict comparisons at .L998. */
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
/* Remainder: the (N mod 8) elements left over, one per iteration,
   compared against lane 1 only.  TEMP is the exact index here, so no
   offset correction is needed. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
addi.d I, I, -1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
/* Reduce the four lanes to one: (s1,x1) vs (s2,x2), (s3,x3) vs (s4,x4),
   then the two winners against each other.  A lane replaces the current
   winner only when its value is strictly larger.
   NOTE(review): because the comparisons are strict and ignore the
   indices, when two lanes hold equal maxima the lower-numbered lane
   wins even if the other lane saw that value at a smaller index (the
   same applies to a remainder element that only ties with lane 1 after
   beating it) - confirm this matches the expected first-occurrence
   behavior. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
/* Return the winning index (x1) in $r4; the maximum absolute value is
   also copied to $f0. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

233
kernel/loongarch64/iamin.S Normal file
View File

@ -0,0 +1,233 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Index-of-minimum-absolute-value kernel (LoongArch64).
 *
 * Inputs : N ($r4) element count, X ($r5) vector base, INCX ($r6) stride in
 *          elements (converted to a byte stride below).
 * Output : $r4 = 1-based position recorded for the smallest |x| seen, or 0
 *          when N <= 0 or INCX <= 0.  $f0 receives a copy of s1 on exit.
 *
 * Four candidate minima (s1..s4) and their positions (x1..x4) are tracked in
 * parallel by the unrolled loop and merged at .L998.  All comparisons are
 * strict "less than".
 *
 * LD, FABS, CMPLT, CMOVT, MOVT, LDINT, SIZE and BASE_SHIFT are provided by
 * common.h, which is not visible here; CMOVT/MOVT are presumably "select /
 * move when the condition flag is set" -- confirm against common.h.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Result defaults to 0 for the N <= 0 and INCX <= 0 exits. */
li.d x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds every candidate; from here N counts the remaining elements. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
li.d x1, 1
bge $r0, N, .L999
/* I = number of 8-element groups; TEMP = position of the next element to test. */
FABS s1, a1
add.d X, X, INCX
FABS s2, a1
li.d x2, 1
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
li.d x3, 1
li.d TEMP, 2
li.d x4, 1
bge $r0, I, .L15
/* Preload the first group of eight elements. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: test the eight loaded values (two batches of four) while
   loading the next eight.  TEMP advances by 4 after each batch. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a3, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a4, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d I, I, -1
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Drain: test the last preloaded group without issuing further loads. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
FABS t1, a5
addi.d TEMP, TEMP, 4
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
/* x2..x4 were recorded as TEMP, the position of the first element of their
   batch; add the lane offset (1, 2, 3) to turn them into real positions. */
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
/* Remainder: the (N & 7) leftover elements, tested one at a time against s1. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
addi.d I, I, -1
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
/* Merge the four candidates pairwise into s1/x1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
/* Exit: x1 ($r17) -> $r4, s1 ($f22) -> $f0, then return through $r1. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

217
kernel/loongarch64/izamax.S Normal file
View File

@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Complex index-of-maximum kernel (LoongArch64).
 *
 * Each element is two consecutive values at X + 0*SIZE and X + 1*SIZE; its
 * magnitude is taken as |first| + |second|.
 *
 * Inputs : N ($r4) element count, X ($r5) vector base, INCX ($r6) stride in
 *          elements (scaled to bytes with ZBASE_SHIFT below).
 * Output : $r4 = 1-based position recorded for the largest magnitude seen,
 *          or 0 when N <= 0 or INCX <= 0.  $f0 receives a copy of s1.
 *
 * Four candidate maxima (s1..s4) with positions (x1..x4) are tracked by the
 * 4-way unrolled loop and merged at .L998.  Comparisons are strict.
 *
 * LD, FABS, ADD, CMPLT, CMOVT, MOVT, LDINT, SIZE and ZBASE_SHIFT come from
 * common.h, which is not visible here -- confirm their exact expansion there.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Result defaults to 0 for the N <= 0 and INCX <= 0 exits. */
li.d x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds every candidate with |a1| + |a2|. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
ADD s2, t1, t2
ADD s3, t1, t2
ADD s4, t1, t2
/* From here N counts the remaining elements; a single element returns 1. */
addi.d N, N, -1
li.d x1, 1
bge $r0, N, .L999
/* I = number of 4-element groups; TEMP = position of the next element. */
add.d X, X, INCX
li.d x2, 1
srai.d I, N, 2
li.d x3, 1
li.d TEMP, 2
li.d x4, 1
bge $r0, I, .L15
/* Preload the first group of four elements (eight values). */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: evaluate the four loaded elements while loading the next four. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t3
LD a8, X, 1 * SIZE
CMPLT $fcc2, s3, t5
add.d X, X, INCX
CMPLT $fcc3, s4, t7
addi.d I, I, -1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Drain: evaluate the last preloaded group without further loads. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t3
CMPLT $fcc2, s3, t5
CMPLT $fcc3, s4, t7
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
/* x2..x4 were recorded as TEMP, the position of the first element of their
   group; add the lane offset (1, 2, 3) to turn them into real positions. */
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
/* Remainder: the (N & 3) leftover elements, tested one at a time against s1. */
.L15:
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
addi.d I, I, -1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
/* Merge the four candidates pairwise into s1/x1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
/* Exit: x1 ($r17) -> $r4, s1 ($f22) -> $f0, then return through $r1. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

217
kernel/loongarch64/izamin.S Normal file
View File

@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Complex index-of-minimum kernel (LoongArch64).
 *
 * Each element is two consecutive values at X + 0*SIZE and X + 1*SIZE; its
 * magnitude is taken as |first| + |second|.
 *
 * Inputs : N ($r4) element count, X ($r5) vector base, INCX ($r6) stride in
 *          elements (scaled to bytes with ZBASE_SHIFT below).
 * Output : $r4 = 1-based position recorded for the smallest magnitude seen,
 *          or 0 when N <= 0 or INCX <= 0.  $f0 receives a copy of s1.
 *
 * Four candidate minima (s1..s4) with positions (x1..x4) are tracked by the
 * 4-way unrolled loop and merged at .L998.  Comparisons are strict.
 *
 * LD, FABS, ADD, CMPLT, CMOVT, MOVT, LDINT, SIZE and ZBASE_SHIFT come from
 * common.h, which is not visible here -- confirm their exact expansion there.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Result defaults to 0 for the N <= 0 and INCX <= 0 exits. */
li.d x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds every candidate with |a1| + |a2|. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
ADD s2, t1, t2
ADD s3, t1, t2
ADD s4, t1, t2
/* From here N counts the remaining elements; a single element returns 1. */
addi.d N, N, -1
li.d x1, 1
bge $r0, N, .L999
/* I = number of 4-element groups; TEMP = position of the next element. */
add.d X, X, INCX
li.d x2, 1
srai.d I, N, 2
li.d x3, 1
li.d TEMP, 2
li.d x4, 1
bge $r0, I, .L15
/* Preload the first group of four elements (eight values). */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: evaluate the four loaded elements while loading the next four. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t3, s2
LD a8, X, 1 * SIZE
CMPLT $fcc2, t5, s3
add.d X, X, INCX
CMPLT $fcc3, t7, s4
addi.d I, I, -1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Drain: evaluate the last preloaded group without further loads. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t3, s2
CMPLT $fcc2, t5, s3
CMPLT $fcc3, t7, s4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
/* x2..x4 were recorded as TEMP, the position of the first element of their
   group; add the lane offset (1, 2, 3) to turn them into real positions. */
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
/* Remainder: the (N & 3) leftover elements, tested one at a time against s1. */
.L15:
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
addi.d I, I, -1
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
/* Merge the four candidates pairwise into s1/x1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
/* Exit: x1 ($r17) -> $r4, s1 ($f22) -> $f0, then return through $r1. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

174
kernel/loongarch64/max.S Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Maximum-value kernel (LoongArch64).
 *
 * Inputs : N ($r4) element count, X ($r5) vector base, INCX ($r6) stride in
 *          elements (scaled to bytes with BASE_SHIFT below).
 * Output : $f0 = largest value found (signed comparison, no absolute value
 *          is taken), or 0 when N <= 0 or INCX <= 0.
 *
 * Four running maxima (s1..s4) are kept by the 8-way unrolled loop and merged
 * at .L998.  TEMP is defined but not referenced in this kernel.
 *
 * LD, MOV, MTC, CMPLT, CMOVT, LDINT, SIZE and BASE_SHIFT come from common.h,
 * which is not visible here -- confirm their exact expansion there.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* s1 is set from the zero register so the early exits return 0. */
MTC s1, $r0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds all four maxima; N now counts the remaining elements and
   I is the number of 8-element groups. */
LD s1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
MOV s2, s1
bge $r0, N, .L999
MOV s3, s1
srai.d I, N, 3
MOV s4, s1
bge $r0, I, .L15
/* Preload six of the eight values of the first group; a7/a8 are fetched
   inside .L12/.L13 while the first comparisons are in flight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: finish the current group (a7, a8) and load a1..a6 of the next
   group while folding a1..a8 into s1..s4. */
.L12:
CMPLT $fcc0, s1, a1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, a2
add.d X, X, INCX
CMPLT $fcc2, s3, a3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, a4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
LD a1, X, 0 * SIZE
CMOVT s2, s2, a2, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a3, $fcc2
LD a2, X, 0 * SIZE
CMOVT s4, s4, a4, $fcc3
add.d X, X, INCX
CMPLT $fcc0, s1, a5
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, a6
add.d X, X, INCX
CMPLT $fcc2, s3, a7
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, a8
add.d X, X, INCX
CMOVT s1, s1, a5, $fcc0
LD a5, X, 0 * SIZE
CMOVT s2, s2, a6, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a7, $fcc2
LD a6, X, 0 * SIZE
CMOVT s4, s4, a8, $fcc3
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L12
.align 3
/* Drain: load the last two values of the final group and fold all eight. */
.L13:
CMPLT $fcc0, s1, a1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, a2
add.d X, X, INCX
CMPLT $fcc2, s3, a3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, a4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
CMOVT s2, s2, a2, $fcc1
CMOVT s3, s3, a3, $fcc2
CMOVT s4, s4, a4, $fcc3
CMPLT $fcc0, s1, a5
CMPLT $fcc1, s2, a6
CMPLT $fcc2, s3, a7
CMPLT $fcc3, s4, a8
CMOVT s1, s1, a5, $fcc0
CMOVT s2, s2, a6, $fcc1
CMOVT s3, s3, a7, $fcc2
CMOVT s4, s4, a8, $fcc3
.align 3
/* Remainder: the (N & 7) leftover elements, folded one at a time into s1. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
CMPLT $fcc0, s1, a1
CMOVT s1, s1, a1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four running maxima pairwise into s1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
.align 3
/* Exit: s1 ($f22) -> $f0.  $r17 is the loop counter I in this file, so the
   value copied into $r4 is not a computed result. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

174
kernel/loongarch64/min.S Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Minimum-value kernel (LoongArch64).
 *
 * Inputs : N ($r4) element count, X ($r5) vector base, INCX ($r6) stride in
 *          elements (scaled to bytes with BASE_SHIFT below).
 * Output : $f0 = smallest value found (signed comparison, no absolute value
 *          is taken), or 0 when N <= 0 or INCX <= 0.
 *
 * Four running minima (s1..s4) are kept by the 8-way unrolled loop and merged
 * at .L998.  TEMP is defined but not referenced in this kernel.
 *
 * LD, MOV, MTC, CMPLT, CMOVT, LDINT, SIZE and BASE_SHIFT come from common.h,
 * which is not visible here -- confirm their exact expansion there.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* s1 is set from the zero register so the early exits return 0. */
MTC s1, $r0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds all four minima; N now counts the remaining elements and
   I is the number of 8-element groups. */
LD s1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
MOV s2, s1
bge $r0, N, .L999
MOV s3, s1
srai.d I, N, 3
MOV s4, s1
bge $r0, I, .L15
/* Preload six of the eight values of the first group; a7/a8 are fetched
   inside .L12/.L13 while the first comparisons are in flight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: finish the current group (a7, a8) and load a1..a6 of the next
   group while folding a1..a8 into s1..s4. */
.L12:
CMPLT $fcc0, a1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, a2, s2
add.d X, X, INCX
CMPLT $fcc2, a3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, a4, s4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
LD a1, X, 0 * SIZE
CMOVT s2, s2, a2, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a3, $fcc2
LD a2, X, 0 * SIZE
CMOVT s4, s4, a4, $fcc3
add.d X, X, INCX
CMPLT $fcc0, a5, s1
LD a3, X, 0 * SIZE
CMPLT $fcc1, a6, s2
add.d X, X, INCX
CMPLT $fcc2, a7, s3
LD a4, X, 0 * SIZE
CMPLT $fcc3, a8, s4
add.d X, X, INCX
CMOVT s1, s1, a5, $fcc0
LD a5, X, 0 * SIZE
CMOVT s2, s2, a6, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a7, $fcc2
LD a6, X, 0 * SIZE
CMOVT s4, s4, a8, $fcc3
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L12
.align 3
/* Drain: load the last two values of the final group and fold all eight. */
.L13:
CMPLT $fcc0, a1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, a2, s2
add.d X, X, INCX
CMPLT $fcc2, a3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, a4, s4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
CMOVT s2, s2, a2, $fcc1
CMOVT s3, s3, a3, $fcc2
CMOVT s4, s4, a4, $fcc3
CMPLT $fcc0, a5, s1
CMPLT $fcc1, a6, s2
CMPLT $fcc2, a7, s3
CMPLT $fcc3, a8, s4
CMOVT s1, s1, a5, $fcc0
CMOVT s2, s2, a6, $fcc1
CMOVT s3, s3, a7, $fcc2
CMOVT s4, s4, a8, $fcc3
.align 3
/* Remainder: the (N & 7) leftover elements, folded one at a time into s1. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
CMPLT $fcc0, a1, s1
CMOVT s1, s1, a1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four running minima pairwise into s1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
.align 3
/* Exit: s1 ($f22) -> $f0.  $r17 is the loop counter I in this file, so the
   value copied into $r4 is not a computed result. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

330
kernel/loongarch64/scal.S Normal file
View File

@ -0,0 +1,330 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Vector scaling kernel (LoongArch64): X[i] = ALPHA * X[i] for N elements.
 *
 * Inputs : N ($r4) element count, ALPHA ($f0) scale factor, X ($r7) vector
 *          base, INCX ($r8) stride in elements (scaled to bytes below).
 *          XX ($r5) is used only as the trailing store pointer of the
 *          strided path.
 *
 * Four code paths, selected by ALPHA == 0 and by INCX == SIZE (contiguous):
 *   .L12/.L16  ALPHA == 0, contiguous : store zero without reading X
 *   .L22/.L26  ALPHA == 0, strided    : store zero without reading X
 *   .L52/.L56  ALPHA != 0, contiguous : software-pipelined load/mul/store
 *   .L62/.L66  ALPHA != 0, strided    : X runs ahead loading, XX stores
 * Each path handles groups of eight elements, then the (N & 7) remainder.
 *
 * LD, ST, MUL, MTC, CMPEQ, SIZE and BASE_SHIFT come from common.h, which is
 * not visible here -- confirm their exact expansion there.
 */
#define N $r4
#define X $r7
#define INCX $r8
#define I $r17
#define TEMP $r18
#define XX $r5
#define ALPHA $f0
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define t1 $f14
#define t2 $f15
#define t3 $f16
#define t4 $f17
PROLOGUE
/* TEMP = byte size of one element, used to detect the contiguous case;
   a1 is set from the zero register for the ALPHA == 0 test and stores. */
li.d TEMP, SIZE
MTC a1, $r0
slli.d INCX, INCX, BASE_SHIFT
bge $r0, N, .L999
CMPEQ $fcc0, ALPHA, a1
/* ALPHA != 0 -> general multiply paths at .L50. */
bceqz $fcc0, .L50
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L15
.align 3
/* ALPHA == 0, contiguous: zero eight elements per iteration. */
.L12:
ST a1, X, 0 * SIZE
ST a1, X, 1 * SIZE
ST a1, X, 2 * SIZE
ST a1, X, 3 * SIZE
ST a1, X, 4 * SIZE
ST a1, X, 5 * SIZE
ST a1, X, 6 * SIZE
ST a1, X, 7 * SIZE
/* 64-bit decrement: I comes from srai.d on the 64-bit N, and every other
   loop in this routine uses addi.d; a 32-bit addi.w would truncate the
   counter for very large N. */
addi.d I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L12
.align 3
/* ALPHA == 0, contiguous: remaining (N & 7) elements. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
ST a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L16
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
/* ALPHA == 0, strided: zero eight elements per iteration. */
.L20:
srai.d I, N, 3
bge $r0, I, .L25
.align 3
.L22:
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L22
.align 3
/* ALPHA == 0, strided: remaining (N & 7) elements. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
addi.d I, I, -1
ST a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L26
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
/* ALPHA != 0: I = (N >> 3) - 1, i.e. the number of groups that still follow
   the preloaded one; negative means there is no full group at all. */
.L50:
srai.d I, N, 3
bne INCX, TEMP, .L60
addi.d I, I, -1
blt I, $r0, .L55
/* Contiguous: preload the first group of eight. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
LD a6, X, 5 * SIZE
LD a7, X, 6 * SIZE
LD a8, X, 7 * SIZE
bge $r0, I, .L53
.align 3
/* Contiguous main loop: scale and store the current group (offsets 0..7)
   while loading the next group (offsets 8..15). */
.L52:
MUL t1, ALPHA, a1
LD a1, X, 8 * SIZE
MUL t2, ALPHA, a2
LD a2, X, 9 * SIZE
MUL t3, ALPHA, a3
LD a3, X, 10 * SIZE
MUL t4, ALPHA, a4
LD a4, X, 11 * SIZE
ST t1, X, 0 * SIZE
MUL t1, ALPHA, a5
LD a5, X, 12 * SIZE
ST t2, X, 1 * SIZE
MUL t2, ALPHA, a6
LD a6, X, 13 * SIZE
ST t3, X, 2 * SIZE
MUL t3, ALPHA, a7
LD a7, X, 14 * SIZE
ST t4, X, 3 * SIZE
MUL t4, ALPHA, a8
LD a8, X, 15 * SIZE
addi.d I, I, -1
ST t1, X, 4 * SIZE
ST t2, X, 5 * SIZE
ST t3, X, 6 * SIZE
ST t4, X, 7 * SIZE
addi.d X, X, 8 * SIZE
blt $r0, I, .L52
.align 3
/* Contiguous drain: scale and store the last preloaded group. */
.L53:
MUL t1, ALPHA, a1
MUL t2, ALPHA, a2
MUL t3, ALPHA, a3
MUL t4, ALPHA, a4
ST t1, X, 0 * SIZE
MUL t1, ALPHA, a5
ST t2, X, 1 * SIZE
MUL t2, ALPHA, a6
ST t3, X, 2 * SIZE
MUL t3, ALPHA, a7
ST t4, X, 3 * SIZE
MUL t4, ALPHA, a8
ST t1, X, 4 * SIZE
ST t2, X, 5 * SIZE
ST t3, X, 6 * SIZE
ST t4, X, 7 * SIZE
addi.d X, X, 8 * SIZE
.align 3
/* Contiguous remainder: (N & 7) elements, one at a time. */
.L55:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L56:
LD a1, X, 0 * SIZE
MUL t1, ALPHA, a1
addi.d X, X, SIZE
addi.d I, I, -1
/* X was already advanced, so the element just scaled sits at -1 * SIZE. */
ST t1, X, -1 * SIZE
blt $r0, I, .L56
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
/* Strided, ALPHA != 0: X is the (leading) load pointer, XX the store pointer. */
.L60:
srai.d I, N, 3
move XX, X
addi.d I, I, -1
blt I, $r0, .L65
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
add.d X, X, INCX
bge $r0, I, .L63
.align 3
/* Strided main loop: scale/store the current group through XX while loading
   the next group through X. */
.L62:
MUL t1, ALPHA, a1
LD a1, X, 0 * SIZE
add.d X, X, INCX
MUL t2, ALPHA, a2
LD a2, X, 0 * SIZE
add.d X, X, INCX
MUL t3, ALPHA, a3
LD a3, X, 0 * SIZE
add.d X, X, INCX
MUL t4, ALPHA, a4
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
add.d XX, XX, INCX
MUL t1, ALPHA, a5
LD a5, X, 0 * SIZE
add.d X, X, INCX
MUL t2, ALPHA, a6
LD a6, X, 0 * SIZE
add.d X, X, INCX
MUL t3, ALPHA, a7
LD a7, X, 0 * SIZE
add.d X, X, INCX
MUL t4, ALPHA, a8
LD a8, X, 0 * SIZE
add.d X, X, INCX
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
addi.d I, I, -1
add.d XX, XX, INCX
blt $r0, I, .L62
.align 3
/* Strided drain: scale and store the last preloaded group through XX. */
.L63:
MUL t1, ALPHA, a1
MUL t2, ALPHA, a2
MUL t3, ALPHA, a3
MUL t4, ALPHA, a4
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
add.d XX, XX, INCX
MUL t1, ALPHA, a5
MUL t2, ALPHA, a6
MUL t3, ALPHA, a7
MUL t4, ALPHA, a8
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
add.d XX, XX, INCX
.align 3
/* Strided remainder: (N & 7) elements, loaded and stored in place through X. */
.L65:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L66:
LD a1, X, 0 * SIZE
MUL t1, ALPHA, a1
addi.d I, I, -1
ST t1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L66
.align 3
/* Common exit: copy $r17 (I) to $r4 and $f22 (a1) to $f0, then return. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

249
kernel/loongarch64/snrm2.S Normal file
View File

@ -0,0 +1,249 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Euclidean-norm kernel (LoongArch64), single-precision result.
 *
 * Inputs : N ($r4) element count, X ($r5) vector base, INCX ($r6) stride in
 *          elements (scaled to bytes with BASE_SHIFT below).
 * Output : $f0 = sqrt(sum of x[i]^2), converted to single precision.
 *          When N <= 0 or INCX <= 0 both accumulators are still zero, so the
 *          exit code returns sqrt(0).
 *
 * Every loaded value is widened with fcvt.d.s and accumulated in double
 * precision with fmadd.d, alternating between two accumulators (s1, s2) that
 * are summed at .L999.  .L12-.L16 handle INCX == SIZE (contiguous) and
 * .L20-.L26 handle any other positive stride.
 *
 * LD, NOP, LDINT, SIZE and BASE_SHIFT come from common.h, which is not
 * visible here -- confirm their exact expansion there.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define a5 $f16
#define a6 $f17
#define a7 $f0
#define a8 $f1
#define s1 $f22
#define s2 $f8
#define t1 $f23
#define t2 $f9
#define t3 $f10
#define t4 $f11
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Zero both accumulators; TEMP = byte size of one element for the
   contiguous-stride test. */
movgr2fr.d s1, $r0
li.d TEMP, SIZE
fmov.d s2, s1
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* I = number of 8-element groups. */
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L15
/* Contiguous: preload eight values and widen the first four. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
LD a6, X, 5 * SIZE
fcvt.d.s t2, a2
LD a7, X, 6 * SIZE
fcvt.d.s t3, a3
LD a8, X, 7 * SIZE
fcvt.d.s t4, a4
bge $r0, I, .L13
.align 3
/* Contiguous main loop: accumulate the current eight squares while loading
   and widening the next eight values. */
.L12:
fmadd.d s1, t1, t1, s1
LD a1, X, 8 * SIZE
fcvt.d.s t1, a5
NOP
fmadd.d s2, t2, t2, s2
LD a2, X, 9 * SIZE
fcvt.d.s t2, a6
NOP
fmadd.d s1, t3, t3, s1
LD a3, X, 10 * SIZE
fcvt.d.s t3, a7
NOP
fmadd.d s2, t4, t4, s2
LD a4, X, 11 * SIZE
fcvt.d.s t4, a8
NOP
fmadd.d s1, t1, t1, s1
LD a5, X, 12 * SIZE
fcvt.d.s t1, a1
NOP
fmadd.d s2, t2, t2, s2
LD a6, X, 13 * SIZE
fcvt.d.s t2, a2
addi.d I, I, -1
fmadd.d s1, t3, t3, s1
LD a7, X, 14 * SIZE
fcvt.d.s t3, a3
addi.d X, X, 8 * SIZE
fmadd.d s2, t4, t4, s2
/* X has already been advanced by eight, so offset 7 here is the 16th value
   relative to the start of this iteration. */
LD a8, X, 7 * SIZE
fcvt.d.s t4, a4
blt $r0, I, .L12
.align 3
/* Contiguous drain: accumulate the last preloaded group. */
.L13:
fmadd.d s1, t1, t1, s1
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
fcvt.d.s t2, a6
fmadd.d s1, t3, t3, s1
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
fcvt.d.s t4, a8
fmadd.d s1, t1, t1, s1
fmadd.d s2, t2, t2, s2
fmadd.d s1, t3, t3, s1
fmadd.d s2, t4, t4, s2
addi.d X, X, 8 * SIZE
.align 3
/* Contiguous remainder: (N & 7) elements, accumulated into s1. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
fmadd.d s1, t1, t1, s1
addi.d X, X, SIZE
blt $r0, I, .L16
b .L999
.align 3
/* Strided path: same scheme, advancing X by INCX after every load. */
.L20:
bge $r0, I, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
fcvt.d.s t2, a2
fcvt.d.s t3, a3
fcvt.d.s t4, a4
add.d X, X, INCX
bge $r0, I, .L24
.align 3
/* Strided main loop. */
.L23:
fmadd.d s1, t1, t1, s1
LD a1, X, 0 * SIZE
fcvt.d.s t1, a5
add.d X, X, INCX
fmadd.d s2, t2, t2, s2
LD a2, X, 0 * SIZE
fcvt.d.s t2, a6
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a3, X, 0 * SIZE
fcvt.d.s t3, a7
add.d X, X, INCX
fmadd.d s2, t4, t4, s2
LD a4, X, 0 * SIZE
fcvt.d.s t4, a8
add.d X, X, INCX
fmadd.d s1, t1, t1, s1
LD a5, X, 0 * SIZE
fcvt.d.s t1, a1
add.d X, X, INCX
fmadd.d s2, t2, t2, s2
LD a6, X, 0 * SIZE
fcvt.d.s t2, a2
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a7, X, 0 * SIZE
fcvt.d.s t3, a3
add.d X, X, INCX
fmadd.d s2, t4, t4, s2
LD a8, X, 0 * SIZE
fcvt.d.s t4, a4
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
.align 3
/* Strided drain: accumulate the last preloaded group. */
.L24:
fmadd.d s1, t1, t1, s1
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
fcvt.d.s t2, a6
fmadd.d s1, t3, t3, s1
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
fcvt.d.s t4, a8
fmadd.d s1, t1, t1, s1
fmadd.d s2, t2, t2, s2
fmadd.d s1, t3, t3, s1
fmadd.d s2, t4, t4, s2
.align 3
/* Strided remainder: (N & 7) elements, accumulated into s1. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
add.d X, X, INCX
fmadd.d s1, t1, t1, s1
blt $r0, I, .L26
.align 3
/* Exit: combine both accumulators, take the square root in double precision
   and narrow the result to single precision in $f0. */
.L999:
fadd.d s1, s1, s2
fsqrt.d s1, s1
move $r4, $r17
fcvt.s.d $f0, s1
jirl $r0, $r1, 0x0
EPILOGUE

330
kernel/loongarch64/swap.S Normal file
View File

@ -0,0 +1,330 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Inputs the kernel reads without initializing them first: element count,
   the two vector base pointers and their strides (in elements on entry;
   the kernel scales them to bytes itself). */
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
/* Scratch integer registers: I is the loop counter, TEMP holds the byte
   stride that identifies the contiguous case. */
#define I $r17
#define TEMP $r18
/* Store pointers for the strided path. They trail X and Y, which run ahead
   as load pointers. */
#define XX $r5
#define YY $r6
/* a1-a8: eight elements most recently loaded from X. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/* b1-b8: the matching eight elements loaded from Y. */
#define b1 $f14
#define b2 $f15
#define b3 $f16
#define b4 $f17
#define b5 $f0
#define b6 $f1
#define b7 $f2
#define b8 $f3
/*
 * Vector swap kernel: exchanges N elements between the vectors at X and Y,
 * stepping by INCX and INCY elements respectively.
 *
 * Two code paths:
 *   .L12/.L13/.L16  both strides equal one element (contiguous data)
 *   .L20-.L26       any other stride combination
 * Each path handles groups of 8 elements in a software-pipelined loop and
 * then the remaining N & 7 elements one at a time.
 *
 * PROLOGUE/EPILOGUE, LD/ST, SIZE and BASE_SHIFT come from common.h.
 * NOTE(review): BASE_SHIFT is presumably log2(SIZE) - confirm in common.h.
 */
PROLOGUE
/* TEMP = byte stride of a contiguous vector. */
li.d TEMP, SIZE
/* Scale the strides from elements to bytes. */
slli.d INCX, INCX, BASE_SHIFT
/* Nothing to do when N <= 0. */
bge $r0, N, .L999
slli.d INCY, INCY, BASE_SHIFT
/* Take the strided path unless both strides are exactly one element. */
bne INCX, TEMP, .L20
srai.d I, N, 3
bne INCY, TEMP, .L20
/* I = N / 8 - 1; negative means there is no full group of 8. */
addi.d I, I, -1
blt I, $r0, .L15
/* Preload the first group of 8 from both vectors. */
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
LD a2, X, 1 * SIZE
LD b2, Y, 1 * SIZE
LD a3, X, 2 * SIZE
LD b3, Y, 2 * SIZE
LD a4, X, 3 * SIZE
LD b4, Y, 3 * SIZE
LD a5, X, 4 * SIZE
LD b5, Y, 4 * SIZE
LD a6, X, 5 * SIZE
LD b6, Y, 5 * SIZE
LD a7, X, 6 * SIZE
LD b7, Y, 6 * SIZE
LD a8, X, 7 * SIZE
LD b8, Y, 7 * SIZE
/* Only one group: go straight to the drain step. */
bge $r0, I, .L13
.align 3
/* Contiguous main loop: store the group already held in registers (crossed
   over: a* to Y, b* to X) while loading the next group from offsets 8..15. */
.L12:
ST a1, Y, 0 * SIZE
LD a1, X, 8 * SIZE
ST b1, X, 0 * SIZE
LD b1, Y, 8 * SIZE
ST a2, Y, 1 * SIZE
LD a2, X, 9 * SIZE
ST b2, X, 1 * SIZE
LD b2, Y, 9 * SIZE
ST a3, Y, 2 * SIZE
LD a3, X, 10 * SIZE
ST b3, X, 2 * SIZE
LD b3, Y, 10 * SIZE
ST a4, Y, 3 * SIZE
LD a4, X, 11 * SIZE
ST b4, X, 3 * SIZE
LD b4, Y, 11 * SIZE
ST a5, Y, 4 * SIZE
LD a5, X, 12 * SIZE
ST b5, X, 4 * SIZE
LD b5, Y, 12 * SIZE
ST a6, Y, 5 * SIZE
LD a6, X, 13 * SIZE
ST b6, X, 5 * SIZE
LD b6, Y, 13 * SIZE
ST a7, Y, 6 * SIZE
LD a7, X, 14 * SIZE
ST b7, X, 6 * SIZE
LD b7, Y, 14 * SIZE
ST a8, Y, 7 * SIZE
LD a8, X, 15 * SIZE
ST b8, X, 7 * SIZE
LD b8, Y, 15 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: write out the last group that was loaded but not yet stored. */
.L13:
ST a1, Y, 0 * SIZE
ST b1, X, 0 * SIZE
ST a2, Y, 1 * SIZE
ST b2, X, 1 * SIZE
ST a3, Y, 2 * SIZE
ST b3, X, 2 * SIZE
ST a4, Y, 3 * SIZE
ST b4, X, 3 * SIZE
ST a5, Y, 4 * SIZE
ST b5, X, 4 * SIZE
ST a6, Y, 5 * SIZE
ST b6, X, 5 * SIZE
ST a7, Y, 6 * SIZE
ST b7, X, 6 * SIZE
ST a8, Y, 7 * SIZE
ST b8, X, 7 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
.align 3
/* Contiguous remainder: N & 7 elements, one per iteration. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
addi.d X, X, SIZE
addi.d I, I, -1
addi.d Y, Y, SIZE
/* The pointers were already advanced, hence the -1 offsets. */
ST b1, X, -1 * SIZE
ST a1, Y, -1 * SIZE
blt $r0, I, .L16
b .L999
.align 3
/* Strided path. X/Y are the load pointers; XX/YY start at the same
   addresses and are the store pointers that follow one group behind. */
.L20:
srai.d I, N, 3
move XX, X
move YY, Y
/* I = N / 8 - 1; negative means there is no full group of 8. */
addi.d I, I, -1
blt I, $r0, .L25
/* Preload the first group of 8, stepping each load pointer by its stride. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD b2, Y, 0 * SIZE
add.d Y, Y, INCY
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD b3, Y, 0 * SIZE
add.d Y, Y, INCY
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD b4, Y, 0 * SIZE
add.d Y, Y, INCY
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD b5, Y, 0 * SIZE
add.d Y, Y, INCY
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD b6, Y, 0 * SIZE
add.d Y, Y, INCY
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD b7, Y, 0 * SIZE
add.d Y, Y, INCY
LD a8, X, 0 * SIZE
add.d X, X, INCX
LD b8, Y, 0 * SIZE
add.d Y, Y, INCY
/* Only one group: go straight to the drain step. */
bge $r0, I, .L23
.align 3
/* Strided main loop: store the held group through XX/YY (crossed over)
   while loading the next group through X/Y. */
.L22:
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST b1, XX, 0 * SIZE
add.d XX, XX, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, YY, 0 * SIZE
add.d YY, YY, INCY
LD a2, X, 0 * SIZE
add.d X, X, INCX
ST b2, XX, 0 * SIZE
add.d XX, XX, INCX
LD b2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, YY, 0 * SIZE
add.d YY, YY, INCY
LD a3, X, 0 * SIZE
add.d X, X, INCX
ST b3, XX, 0 * SIZE
add.d XX, XX, INCX
LD b3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, YY, 0 * SIZE
add.d YY, YY, INCY
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST b4, XX, 0 * SIZE
add.d XX, XX, INCX
LD b4, Y, 0 * SIZE
add.d Y, Y, INCY
ST a5, YY, 0 * SIZE
add.d YY, YY, INCY
LD a5, X, 0 * SIZE
add.d X, X, INCX
ST b5, XX, 0 * SIZE
add.d XX, XX, INCX
LD b5, Y, 0 * SIZE
add.d Y, Y, INCY
ST a6, YY, 0 * SIZE
add.d YY, YY, INCY
LD a6, X, 0 * SIZE
add.d X, X, INCX
ST b6, XX, 0 * SIZE
add.d XX, XX, INCX
LD b6, Y, 0 * SIZE
add.d Y, Y, INCY
ST a7, YY, 0 * SIZE
add.d YY, YY, INCY
LD a7, X, 0 * SIZE
add.d X, X, INCX
ST b7, XX, 0 * SIZE
add.d XX, XX, INCX
LD b7, Y, 0 * SIZE
add.d Y, Y, INCY
ST a8, YY, 0 * SIZE
add.d YY, YY, INCY
LD a8, X, 0 * SIZE
add.d X, X, INCX
ST b8, XX, 0 * SIZE
add.d XX, XX, INCX
LD b8, Y, 0 * SIZE
addi.d I, I, -1
add.d Y, Y, INCY
blt $r0, I, .L22
.align 3
/* Drain: write out the last group that was loaded but not yet stored. */
.L23:
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
ST b1, XX, 0 * SIZE
add.d XX, XX, INCX
ST a2, YY, 0 * SIZE
add.d YY, YY, INCY
ST b2, XX, 0 * SIZE
add.d XX, XX, INCX
ST a3, YY, 0 * SIZE
add.d YY, YY, INCY
ST b3, XX, 0 * SIZE
add.d XX, XX, INCX
ST a4, YY, 0 * SIZE
add.d YY, YY, INCY
ST b4, XX, 0 * SIZE
add.d XX, XX, INCX
ST a5, YY, 0 * SIZE
add.d YY, YY, INCY
ST b5, XX, 0 * SIZE
add.d XX, XX, INCX
ST a6, YY, 0 * SIZE
add.d YY, YY, INCY
ST b6, XX, 0 * SIZE
add.d XX, XX, INCX
ST a7, YY, 0 * SIZE
add.d YY, YY, INCY
ST b7, XX, 0 * SIZE
add.d XX, XX, INCX
ST a8, YY, 0 * SIZE
add.d YY, YY, INCY
ST b8, XX, 0 * SIZE
add.d XX, XX, INCX
.align 3
/* Strided remainder: N & 7 elements, swapped in place through X/Y (the
   load pointers already sit just past the last full group). */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST b1, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L26
.align 3
/* Common exit. Copies I into $r4 and a1 into $f0 before returning through
   $r1. NOTE(review): these values look incidental for a swap - confirm that
   callers ignore the return registers. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

190
kernel/loongarch64/zamax.S Normal file
View File

@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Inputs: element count, vector base pointer and stride (in complex
   elements on entry; scaled to bytes by the kernel). */
#define N $r4
#define X $r5
#define INCX $r6
/* I is the loop counter. TEMP is defined but not used in this kernel. */
#define I $r17
#define TEMP $r18
/* a1-a8: real/imaginary parts of four consecutive complex elements. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1-t8: absolute values of a1-a8; t1/t3/t5/t7 are then reused for the
   per-element sums |re| + |im|. */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
/* s1-s4: four independent running candidates; s1 carries the final result. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
/*
 * Complex "amax" kernel: scans N complex elements of X (stride INCX) and
 * returns the largest value of |re| + |im| in $f0.
 *
 * s1 starts from whatever MTC writes for $r0 and is returned unchanged when
 * N <= 0 or INCX <= 0. Otherwise all four candidates s1-s4 are seeded from
 * the first element, the remaining N - 1 elements are processed four at a
 * time (.L12/.L13), leftovers one at a time (.L16), and the candidates are
 * merged at .L998.
 *
 * LD, FABS, ADD, MTC, CMPLT, CMOVT and ZBASE_SHIFT come from common.h.
 * NOTE(review): the comments below assume CMPLT f, x, y sets f when x < y
 * and CMOVT d, d, t, f replaces d with t when f is set - confirm in common.h.
 */
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the arguments are dereferenced first.
   NOTE(review): "0(N)" is MIPS-style operand syntax - confirm that LDINT
   accepts it on LoongArch before enabling this path. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
MTC s1, $r0
bge $r0, N, .L999
/* Scale the stride from complex elements to bytes. */
slli.d INCX, INCX, ZBASE_SHIFT
/* Non-positive strides are rejected. */
bge $r0, INCX, .L999
/* Seed the result with |re| + |im| of the first element. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
/* Single-element vector: s1 is already the answer. */
bge $r0, N, .L999
ADD s2, t1, t2
/* I = number of 4-element groups among the remaining N elements. */
srai.d I, N, 2
ADD s3, t1, t2
ADD s4, t1, t2
bge $r0, I, .L15
/* Preload the first group of four complex elements. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: evaluate the held group while loading the next one. Each of
   the four lanes updates its own candidate (s1..s4). */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t3
LD a8, X, 1 * SIZE
CMPLT $fcc2, s3, t5
add.d X, X, INCX
CMPLT $fcc3, s4, t7
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t3, $fcc1
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
blt $r0, I, .L12
.align 3
/* Drain: evaluate the last group that was loaded but not yet compared. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t3
CMPLT $fcc2, s3, t5
CMPLT $fcc3, s4, t7
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t3, $fcc1
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
.align 3
/* Remainder: N & 3 elements, folded into s1 only. */
.L15:
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four candidates pairwise into s1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
.align 3
/* Exit: move the result (s1 = $f22) into $f0 and return through $r1. I is
   also copied into $r4. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

198
kernel/loongarch64/zamin.S Normal file
View File

@ -0,0 +1,198 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Inputs: element count, vector base pointer and stride (in complex
   elements on entry; scaled to bytes by the kernel). */
#define N $r4
#define X $r5
#define INCX $r6
/* I is the loop counter. TEMP is defined but not used in this kernel. */
#define I $r17
#define TEMP $r18
/* a1-a8: real/imaginary parts of four consecutive complex elements. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1-t8: absolute values of a1-a8; t1/t3/t5/t7 are then reused for the
   per-element sums |re| + |im|. */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
/* s1-s4: four independent running candidates; s1 carries the final result. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
/*
 * Complex "amin" kernel: scans N complex elements of X (stride INCX) and
 * returns the smallest value of |re| + |im| in $f0.
 *
 * Same structure as the amax kernel, but every comparison has its operands
 * swapped (CMPLT flag, t, s) so a candidate is replaced when the new value
 * is the smaller one. s1 starts from whatever MTC writes for $r0 and is
 * returned unchanged when N <= 0 or INCX <= 0.
 *
 * LD, FABS, ADD, MTC, CMPLT, CMOVT, NOP and ZBASE_SHIFT come from common.h.
 * NOTE(review): the comments below assume CMPLT f, x, y sets f when x < y
 * and CMOVT d, d, t, f replaces d with t when f is set - confirm in common.h.
 */
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the arguments are dereferenced first.
   NOTE(review): "0(N)" is MIPS-style operand syntax - confirm that LDINT
   accepts it on LoongArch before enabling this path. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
MTC s1, $r0
bge $r0, N, .L999
/* Scale the stride from complex elements to bytes. */
slli.d INCX, INCX, ZBASE_SHIFT
/* Non-positive strides are rejected. */
bge $r0, INCX, .L999
/* Seed the result with |re| + |im| of the first element. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
/* Single-element vector: s1 is already the answer. */
bge $r0, N, .L999
NOP
ADD s2, t1, t2
/* I = number of 4-element groups among the remaining N elements. */
srai.d I, N, 2
ADD s3, t1, t2
ADD s4, t1, t2
bge $r0, I, .L15
/* Preload the first group of four complex elements. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: evaluate the held group while loading the next one. Each of
   the four lanes updates its own candidate (s1..s4). */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
NOP
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
NOP
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
NOP
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t3, s2
LD a8, X, 1 * SIZE
CMPLT $fcc2, t5, s3
add.d X, X, INCX
CMPLT $fcc3, t7, s4
NOP
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t3, $fcc1
NOP
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
blt $r0, I, .L12
NOP
.align 3
/* Drain: evaluate the last group that was loaded but not yet compared. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t3, s2
CMPLT $fcc2, t5, s3
CMPLT $fcc3, t7, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t3, $fcc1
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
.align 3
/* Remainder: N & 3 elements, folded into s1 only. */
.L15:
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four candidates pairwise into s1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
.align 3
/* Exit: move the result (s1 = $f22) into $f0 and return through $r1. I is
   also copied into $r4. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
NOP
EPILOGUE

158
kernel/loongarch64/zasum.S Normal file
View File

@ -0,0 +1,158 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Inputs: element count, vector base pointer and stride (in complex
   elements on entry; scaled to bytes by the kernel). */
#define N $r4
#define X $r5
#define INCX $r6
/* I is the loop counter. TEMP is defined but not used in this kernel. */
#define I $r17
#define TEMP $r18
/* a1-a8: real/imaginary parts of four consecutive complex elements. */
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define a5 $f12
#define a6 $f13
#define a7 $f14
#define a8 $f15
/* t1-t4: absolute values waiting to be accumulated. */
#define t1 $f16
#define t2 $f17
#define t3 $f0
#define t4 $f1
/* s1 accumulates |real parts|, s2 accumulates |imaginary parts|. */
#define s1 $f22
#define s2 $f8
/*
 * Complex "asum" kernel: returns in $f0 the sum of |re| + |im| over N
 * complex elements of X, stepping by INCX complex elements.
 *
 * Two accumulators are used: s1 collects the absolute real parts and s2 the
 * absolute imaginary parts; they are added together at .L999. Elements are
 * processed four at a time in a software-pipelined loop (.L23/.L24), then
 * the remaining N & 3 elements one at a time (.L26).
 *
 * LD, FABS, ADD, MTC, NOP and ZBASE_SHIFT come from common.h.
 */
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the arguments are dereferenced first.
   NOTE(review): "0(N)" is MIPS-style operand syntax - confirm that LDINT
   accepts it on LoongArch before enabling this path. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Both accumulators are initialised from $r0. */
MTC s1, $r0
MTC s2, $r0
/* Scale the stride from complex elements to bytes. */
slli.d INCX, INCX, ZBASE_SHIFT
/* I = number of 4-element groups. */
srai.d I, N, 2
bge $r0, N, .L999
bge $r0, I, .L25
/* Preload the first group and start the absolute values of its first half. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
FABS t3, a3
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L24
.align 3
/* Main loop: accumulate the pending t1-t4, compute the next absolute
   values, and load the following group, all interleaved. */
.L23:
ADD s1, s1, t1
LD a1, X, 0 * SIZE
FABS t1, a5
addi.d I, I, -1
ADD s2, s2, t2
LD a2, X, 1 * SIZE
FABS t2, a6
add.d X, X, INCX
ADD s1, s1, t3
LD a3, X, 0 * SIZE
FABS t3, a7
NOP
ADD s2, s2, t4
LD a4, X, 1 * SIZE
FABS t4, a8
add.d X, X, INCX
ADD s1, s1, t1
LD a5, X, 0 * SIZE
FABS t1, a1
NOP
ADD s2, s2, t2
LD a6, X, 1 * SIZE
FABS t2, a2
add.d X, X, INCX
ADD s1, s1, t3
LD a7, X, 0 * SIZE
FABS t3, a3
LD a8, X, 1 * SIZE
ADD s2, s2, t4
add.d X, X, INCX
FABS t4, a4
blt $r0, I, .L23
.align 3
/* Drain: accumulate the last group without loading anything further. */
.L24:
ADD s1, s1, t1
FABS t1, a5
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* Remainder: N & 3 elements, one per iteration. */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
FABS t1, a1
addi.d I, I, -1
FABS t2, a2
add.d X, X, INCX
ADD s1, s1, t1
ADD s2, s2, t2
blt $r0, I, .L26
.align 3
/* Exit: combine the two accumulators, move the total (s1 = $f22) into $f0
   and return through $r1. I is also copied into $r4. */
.L999:
ADD s1, s1, s2
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

217
kernel/loongarch64/zcopy.S Normal file
View File

@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Inputs: element count, source pointer/stride and destination
   pointer/stride (strides in complex elements on entry; scaled to bytes by
   the kernel). */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
/* I is the loop counter; TEMP holds the byte stride of a contiguous
   complex vector. */
#define I $r17
#define TEMP $r18
/* a1-a8: real/imaginary parts of four complex elements in flight. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/*
 * Complex copy kernel: copies N complex elements from X (stride INCX) to
 * Y (stride INCY).
 *
 * Two code paths:
 *   .L12/.L13/.L16  both strides equal one complex element (contiguous)
 *   .L20-.L26       any other stride combination
 * Each path moves four complex elements (eight scalars) per iteration in a
 * software-pipelined loop and then the remaining N & 3 elements one by one.
 *
 * LD, ST, NOP, SIZE and ZBASE_SHIFT come from common.h.
 */
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the arguments are dereferenced first.
   NOTE(review): "0(N)" is MIPS-style operand syntax - confirm that LDINT
   accepts it on LoongArch before enabling this path. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* TEMP = byte stride of a contiguous complex vector (two scalars). */
li.d TEMP, 2 * SIZE
NOP
/* Scale the strides from complex elements to bytes. */
slli.d INCX, INCX, ZBASE_SHIFT
/* Nothing to do when N <= 0. */
bge $r0, N, .L999
slli.d INCY, INCY, ZBASE_SHIFT
/* Take the strided path unless both strides are exactly one element. */
bne INCX, TEMP, .L20
srai.d I, N, 2
bne INCY, TEMP, .L20
/* I = N / 4 - 1; negative means there is no full group of four. */
addi.d I, I, -1
blt I, $r0, .L15
/* Preload the first group. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
LD a6, X, 5 * SIZE
LD a7, X, 6 * SIZE
LD a8, X, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Contiguous main loop: store the held group while loading the next one
   from offsets 8..15. */
.L12:
ST a1, Y, 0 * SIZE
LD a1, X, 8 * SIZE
ST a2, Y, 1 * SIZE
LD a2, X, 9 * SIZE
ST a3, Y, 2 * SIZE
LD a3, X, 10 * SIZE
ST a4, Y, 3 * SIZE
LD a4, X, 11 * SIZE
ST a5, Y, 4 * SIZE
LD a5, X, 12 * SIZE
ST a6, Y, 5 * SIZE
LD a6, X, 13 * SIZE
ST a7, Y, 6 * SIZE
LD a7, X, 14 * SIZE
ST a8, Y, 7 * SIZE
LD a8, X, 15 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: write out the last group that was loaded but not yet stored. */
.L13:
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
ST a3, Y, 2 * SIZE
ST a4, Y, 3 * SIZE
ST a5, Y, 4 * SIZE
ST a6, Y, 5 * SIZE
ST a7, Y, 6 * SIZE
ST a8, Y, 7 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
.align 3
/* Contiguous remainder: N & 3 elements, one per iteration. */
.L15:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d X, X, 2 * SIZE
addi.d Y, Y, 2 * SIZE
/* Y was already advanced, hence the negative offsets. */
ST a1, Y, -2 * SIZE
addi.d I, I, -1
ST a2, Y, -1 * SIZE
blt $r0, I, .L16
/* The contiguous path returns here with its own copy of the exit sequence
   instead of branching to .L999. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
NOP
.align 3
/* Strided path. */
.L20:
srai.d I, N, 2
/* I = N / 4 - 1; negative means there is no full group of four. */
addi.d I, I, -1
blt I, $r0, .L25
/* Preload the first group, stepping X by its stride after each element. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
add.d X, X, INCX
bge $r0, I, .L23
.align 3
/* Strided main loop: store the held group to Y while loading the next
   group from X. */
.L22:
ST a1, Y, 0 * SIZE
LD a1, X, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
LD a2, X, 1 * SIZE
add.d X, X, INCX
ST a3, Y, 0 * SIZE
LD a3, X, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a5, Y, 0 * SIZE
LD a5, X, 0 * SIZE
ST a6, Y, 1 * SIZE
add.d Y, Y, INCY
LD a6, X, 1 * SIZE
add.d X, X, INCX
ST a7, Y, 0 * SIZE
LD a7, X, 0 * SIZE
ST a8, Y, 1 * SIZE
add.d Y, Y, INCY
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L22
.align 3
/* Drain: write out the last group that was loaded but not yet stored. */
.L23:
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
ST a5, Y, 0 * SIZE
ST a6, Y, 1 * SIZE
add.d Y, Y, INCY
ST a7, Y, 0 * SIZE
ST a8, Y, 1 * SIZE
add.d Y, Y, INCY
.align 3
/* Strided remainder: N & 3 elements, one per iteration. */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L26
.align 3
/* Common exit. Copies I into $r4 and a1 into $f0 before returning through
   $r1. NOTE(review): these values look incidental for a copy - confirm that
   callers ignore the return registers. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

330
kernel/loongarch64/zdot.S Normal file
View File

@ -0,0 +1,330 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Inputs: element count and the two vector pointers with their strides
   (strides in complex elements on entry; scaled to bytes by the kernel). */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
/* I is the loop counter; TEMP holds the byte stride of a contiguous
   complex vector. */
#define I $r17
#define TEMP $r18
/* a1/a2 and a3/a4: real/imaginary parts of two alternating X elements. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
/* b1/b2 and b3/b4: real/imaginary parts of the matching Y elements. */
#define b1 $f14
#define b2 $f15
#define b3 $f16
#define b4 $f17
/* Partial sums of the four cross products:
   s1 = y.re * x.re, s2 = y.re * x.im, s3 = y.im * x.re, s4 = y.im * x.im. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
/*
 * Complex dot-product kernel over N complex elements of X (stride INCX)
 * and Y (stride INCY). The real part is returned in $f0 and the imaginary
 * part in $f1.
 *
 * Four partial sums are kept:
 *   s1 = sum(y.re * x.re)   s2 = sum(y.re * x.im)
 *   s3 = sum(y.im * x.re)   s4 = sum(y.im * x.im)
 * and combined at .L999:
 *   without CONJ:  re = s1 - s4,  im = s3 + s2
 *   with CONJ:     re = s1 + s4,  im = s3 - s2
 *
 * Two code paths:
 *   .L13-.L17  both strides equal one complex element (contiguous)
 *   .L20-.L26  any other stride combination
 *
 * LD, MADD, MOV, MTC, ADD, SUB, SIZE and ZBASE_SHIFT come from common.h.
 * NOTE(review): MADD d, x, y, acc is assumed to compute d = x * y + acc -
 * confirm in common.h.
 */
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the arguments are dereferenced first.
   NOTE(review): "0(N)" is MIPS-style operand syntax - confirm that LDINT
   accepts it on LoongArch before enabling this path. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* Initialise all four partial sums from $r0. */
MTC s1, $r0
MOV s2, s1
MOV s3, s2
MOV s4, s3
/* Scale the strides from complex elements to bytes. */
slli.d INCX, INCX, ZBASE_SHIFT
/* TEMP = byte stride of a contiguous complex vector (two scalars). */
li.d TEMP, 2 * SIZE
slli.d INCY, INCY, ZBASE_SHIFT
bge $r0, N, .L999
/* I = number of 4-element groups; also used by the strided path. */
srai.d I, N, 2
bne INCX, TEMP, .L20
bne INCY, TEMP, .L20
bge $r0, I, .L15
/* Preload the first element pair. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
addi.d I, I, -1
LD b2, Y, 1 * SIZE
bge $r0, I, .L14
.align 3
/* Contiguous main loop: four elements per iteration, alternating between
   the (a1,a2,b1,b2) and (a3,a4,b3,b4) register sets so that loads for the
   next element overlap the multiply-adds of the current one. The last
   loads (offsets 8, 9) fetch the first element of the next group. */
.L13:
MADD s1, b1, a1, s1
LD a3, X, 2 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 3 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 2 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 3 * SIZE
MADD s1, b3, a3, s1
LD a1, X, 4 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 5 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 4 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 5 * SIZE
MADD s1, b1, a1, s1
LD a3, X, 6 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 7 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 6 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 7 * SIZE
MADD s1, b3, a3, s1
LD a1, X, 8 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 9 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 8 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 9 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L13
.align 3
/* Drain: same as one loop iteration, but without loading past the group. */
.L14:
MADD s1, b1, a1, s1
LD a3, X, 2 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 3 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 2 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 3 * SIZE
MADD s1, b3, a3, s1
LD a1, X, 4 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 5 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 4 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 5 * SIZE
MADD s1, b1, a1, s1
LD a3, X, 6 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 7 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 6 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 7 * SIZE
MADD s1, b3, a3, s1
addi.d X, X, 8 * SIZE
MADD s2, b3, a4, s2
addi.d Y, Y, 8 * SIZE
MADD s3, b4, a3, s3
MADD s4, b4, a4, s4
.align 3
/* Contiguous remainder: N & 3 elements, pipelined one element deep. */
.L15:
andi I, N, 3
bge $r0, I, .L999
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
addi.d I, I, -1
LD b2, Y, 1 * SIZE
bge $r0, I, .L17
.align 3
.L16:
MADD s1, b1, a1, s1
addi.d I, I, -1
MADD s2, b1, a2, s2
LD b1, Y, 2 * SIZE
MADD s3, b2, a1, s3
LD a1, X, 2 * SIZE
MADD s4, b2, a2, s4
LD a2, X, 3 * SIZE
LD b2, Y, 3 * SIZE
addi.d X, X, 2 * SIZE
addi.d Y, Y, 2 * SIZE
blt $r0, I, .L16
.align 3
/* Accumulate the last element that was loaded. */
.L17:
MADD s1, b1, a1, s1
MADD s2, b1, a2, s2
MADD s3, b2, a1, s3
MADD s4, b2, a2, s4
b .L999
.align 3
/* Strided path. */
.L20:
#ifdef F_INTERFACE
/* For a negative stride, move the base pointer by (N - 1) * stride bytes
   so that stepping by the stride walks the vector from its far end.
   Written with LoongArch mul.d/sub.d: the ISA has no HI/LO multiply
   registers, so the multiply writes its destination register directly. */
bge INCX, $r0, .L21
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCX
sub.d X, X, TEMP
.align 3
.L21:
bge INCY, $r0, .L22
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCY
sub.d Y, Y, TEMP
.align 3
.L22:
#endif
/* I still holds N / 4 from the common prologue. */
bge $r0, I, .L25
/* Preload the first element pair. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
LD b2, Y, 1 * SIZE
add.d X, X, INCX
addi.d I, I, -1
add.d Y, Y, INCY
bge $r0, I, .L24
.align 3
/* Strided main loop: four elements per iteration, alternating register
   sets as in the contiguous loop; pointers advance by one stride after
   each element is loaded. */
.L23:
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b3, a3, s1
LD a1, X, 0 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 1 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 0 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b3, a3, s1
LD a1, X, 0 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 1 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 0 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 1 * SIZE
add.d X, X, INCX
addi.d I, I, -1
add.d Y, Y, INCY
blt $r0, I, .L23
.align 3
/* Drain: finish the last group without loading past it. */
.L24:
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b3, a3, s1
LD a1, X, 0 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 1 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 0 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
MADD s1, b3, a3, s1
add.d X, X, INCX
MADD s2, b3, a4, s2
add.d Y, Y, INCY
MADD s3, b4, a3, s3
MADD s4, b4, a4, s4
.align 3
/* Strided remainder: N & 3 elements, one per iteration (not pipelined). */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
LD b2, Y, 1 * SIZE
MADD s1, b1, a1, s1
MADD s2, b1, a2, s2
MADD s3, b2, a1, s3
MADD s4, b2, a2, s4
add.d X, X, INCX
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L26
.align 3
/* Combine the partial sums into the complex result ($f0 = real part,
   $f1 = imaginary part); CONJ flips the sign of the s4 and s2 terms. */
.L999:
#ifndef CONJ
SUB $f0, s1, s4
#else
ADD $f0, s1, s4
#endif
#ifndef CONJ
ADD $f1, s3, s2
#else
SUB $f1, s3, s2
#endif
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,648 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Register aliases used by the kernel below.
 * Integer registers: problem sizes, data pointers and strides. INCY and
 * BUFFER do not arrive in their registers; the prologue loads both from the
 * stack.
 * NOTE(review): $r4-$r11 look like the LoongArch integer argument registers -
 * confirm against the C prototype of this kernel.
 */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r17
/* Base of the Y data the compute loops work on: either Y itself or BUFFER. */
#define YORIG $r18
/* Running pointers (XX, YY) and loop counters (I: inner loop, J: outer loop). */
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
/* Pointers into the column(s) of A being processed; the kernel saves and restores $r23/$r24. */
#define AO1 $r23
#define AO2 $r24
/* The two components of the scalar multiplier. */
#define ALPHA_R $f0
#define ALPHA_I $f1
/* a1..a8: values loaded from A; also reused as temporaries by the copy loops and for the ALPHA products. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/* x1..x4: values loaded from X, overwritten with their ALPHA-combined form before the inner loops. */
#define x1 $f14
#define x2 $f15
#define x3 $f16
#define x4 $f17
/* y1..y4: values loaded from the Y working area. */
#define y1 $f3
#define y2 $f4
#define y3 $f2
#define y4 $f5
/* t1..t8: results waiting to be stored; the kernel saves and restores $f24/$f25 (t7/t8). */
#define t1 $f6
#define t2 $f7
#define t3 $f18
#define t4 $f19
#define t5 $f20
#define t6 $f21
#define t7 $f24
#define t8 $f25
/*
 * Choose the multiply-accumulate macro behind MADD1..MADD4 from the
 * CONJ / XCONJ build flags. Resulting table (same for every combination
 * as the original four-way #if list):
 *
 *   CONJ  XCONJ | MADD1  MADD2  MADD3  MADD4
 *   ------------+---------------------------
 *    no    no   | MADD   MADD   NMSUB  MADD
 *    yes   no   | MADD   MADD   MADD   NMSUB
 *    no    yes  | MADD   NMSUB  MADD   MADD
 *    yes   yes  | MADD   NMSUB  NMSUB  NMSUB
 */
/* MADD1 does not depend on either flag. */
#define MADD1 MADD
/* MADD2 follows XCONJ only. */
#ifdef XCONJ
#define MADD2 NMSUB
#else
#define MADD2 MADD
#endif
/* MADD3 subtracts exactly when CONJ and XCONJ are both set or both unset. */
#if defined(CONJ) == defined(XCONJ)
#define MADD3 NMSUB
#else
#define MADD3 MADD
#endif
/* MADD4 follows CONJ only. */
#ifdef CONJ
#define MADD4 NMSUB
#else
#define MADD4 MADD
#endif
/*
 * Matrix-vector update kernel. Reviewer reading: complex GEMV in the
 * non-transposed form; the file name is not visible here - confirm.
 *
 * For each column of A the kernel loads one X element (two values, at SIZE
 * offsets 0 and 1), combines it with ALPHA_R / ALPHA_I, and accumulates
 * column-times-x into M consecutive Y elements through MADD1..MADD4.
 * Columns are handled two at a time (AO1, AO2), with a one-column tail at
 * .L20 when N is odd.
 *
 * When the scaled INCY differs from 2 * SIZE, Y is first gathered into
 * BUFFER, updated there, and scattered back to Y at .L900.
 *
 * Inputs as used below: M, N, A, LDA, X, INCX, Y in registers,
 * ALPHA_R / ALPHA_I in $f0 / $f1, INCY and BUFFER on the stack.
 * Comments below assume MADD d,a,b,c = a * b + c, NMSUB d,a,b,c = c - a * b
 * and MSUB d,a,b,c = a * b - c; these macros come from the included
 * headers - confirm.
 */
PROLOGUE
/* INCY and BUFFER are read from the first two stack slots before the frame is allocated. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
/* Allocate the frame: 32 bytes when __64BIT__ is defined, 64 bytes otherwise. */
#ifndef __64BIT__
addi.d $sp, $sp, -64
#else
addi.d $sp, $sp, -32
#endif
/* Save the registers used as AO1, AO2, t7 and t8. */
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
fst.d $f24, $sp, 16
fst.d $f25, $sp, 24
/* Without __64BIT__ the registers behind t3..t6 are saved as well. */
#ifndef __64BIT__
fst.d $f18, $sp, 32
fst.d $f19, $sp, 40
fst.d $f20, $sp, 48
fst.d $f21, $sp, 56
#endif
/* Scale the strides by ZBASE_SHIFT (presumably element counts to byte offsets - confirm). */
slli.d LDA, LDA, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
/* Nothing to do when M <= 0 or N <= 0. */
bge $r0, M, .L999
slli.d INCY, INCY, ZBASE_SHIFT
bge $r0, N, .L999
/* If the scaled INCY equals 2 * SIZE the loops work directly on Y (YORIG = Y). */
li.d I, 2 * SIZE
move YORIG, Y
beq INCY, I, .L10
/* Otherwise gather Y into BUFFER and work on BUFFER (YORIG = BUFFER). */
srai.d I, M, 2
move YORIG, BUFFER
move XX, Y
move YY, BUFFER
bge $r0, I, .L05
.align 3
/* Gather loop: four Y elements (two values each) per trip, M >> 2 trips. */
.L02:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
add.d XX, XX, INCY
LD a3, XX, 0 * SIZE
LD a4, XX, 1 * SIZE
add.d XX, XX, INCY
LD a5, XX, 0 * SIZE
LD a6, XX, 1 * SIZE
add.d XX, XX, INCY
LD a7, XX, 0 * SIZE
LD a8, XX, 1 * SIZE
add.d XX, XX, INCY
addi.d I, I, -1
/* YY is advanced first, so the stores below use negative offsets. */
addi.d YY, YY, 8 * SIZE
ST a1, YY, -8 * SIZE
ST a2, YY, -7 * SIZE
ST a3, YY, -6 * SIZE
ST a4, YY, -5 * SIZE
ST a5, YY, -4 * SIZE
ST a6, YY, -3 * SIZE
ST a7, YY, -2 * SIZE
ST a8, YY, -1 * SIZE
blt $r0, I, .L02
.align 3
/* Gather the remaining M & 3 elements one at a time. */
.L05:
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
add.d XX, XX, INCY
addi.d I, I, -1
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
addi.d YY, YY, 2 * SIZE
blt $r0, I, .L06
.align 3
/* Outer loop over pairs of columns: J = N >> 1. */
.L10:
srai.d J, N, 1
bge $r0, J, .L20
.align 3
.L11:
/* Load two X elements (x1/x2 and x3/x4) and step X past them. */
LD x1, X, 0 * SIZE
LD x2, X, 1 * SIZE
add.d X, X, INCX
LD x3, X, 0 * SIZE
LD x4, X, 1 * SIZE
add.d X, X, INCX
/* Form the ALPHA products; interleaved: AO1 = A, AO2 = A + LDA, A += 2 * LDA. */
MUL a1, ALPHA_R, x1
move AO1, A
MUL a2, ALPHA_I, x1
add.d AO2, A, LDA
MUL a3, ALPHA_R, x3
add.d A, AO2, LDA
MUL a4, ALPHA_I, x3
/*
 * Combine into the final multipliers, overwriting x1..x4:
 *   default: x1 = ALPHA_R*x1 - ALPHA_I*x2, x2 = ALPHA_I*x1 + ALPHA_R*x2
 *   XCONJ:   x1 = ALPHA_R*x1 + ALPHA_I*x2, x2 = ALPHA_R*x2 - ALPHA_I*x1
 * (right-hand sides use the values as loaded; same for x3/x4).
 */
#ifndef XCONJ
NMSUB x1, x2, ALPHA_I, a1
MADD x2, x2, ALPHA_R, a2
NMSUB x3, x4, ALPHA_I, a3
MADD x4, x4, ALPHA_R, a4
#else
MADD x1, x2, ALPHA_I, a1
MSUB x2, x2, ALPHA_R, a2
MADD x3, x4, ALPHA_I, a3
MSUB x4, x4, ALPHA_R, a4
#endif
/* Inner loop setup: I = M >> 2 blocks of four Y elements, starting at YORIG. */
srai.d I, M, 2
move YY, YORIG
bge $r0, I, .L15
/*
 * Pipeline fill: load Y offsets 0..3 and the matching values of both
 * columns, start t1..t4 for them, and preload the operands at offsets 4..7.
 */
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a2, AO1, 1 * SIZE
LD y4, YY, 3 * SIZE
LD a4, AO1, 3 * SIZE
LD a5, AO2, 0 * SIZE
LD a6, AO2, 1 * SIZE
LD a7, AO2, 2 * SIZE
LD a8, AO2, 3 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 4 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 5 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 6 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 6 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 5 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 7 * SIZE
MADD4 t4, a4, x1, t4
LD a4, AO1, 7 * SIZE
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
LD a5, AO2, 4 * SIZE
MADD1 t3, a7, x3, t3
MADD2 t4, a7, x4, t4
LD a7, AO2, 6 * SIZE
MADD3 t1, a6, x4, t1
MADD4 t2, a6, x3, t2
LD a6, AO2, 5 * SIZE
MADD3 t3, a8, x4, t3
addi.d I, I, -1
MADD4 t4, a8, x3, t4
LD a8, AO2, 7 * SIZE
/* Only one block in total: skip the steady-state loop and drain. */
bge $r0, I, .L13
.align 3
/*
 * Steady state, one block (8 * SIZE of Y) per trip.
 * First half: compute t5..t8 for offsets 4..7, store t1..t4 to offsets 0..3,
 * preload offsets 8..11.
 * Second half: compute t1..t4 for offsets 8..11, store t5..t8 to offsets
 * 4..7, preload offsets 12..15.
 */
.L12:
MADD1 t5, a1, x1, y1
LD y1, YY, 8 * SIZE
MADD2 t6, a1, x2, y2
LD a1, AO1, 8 * SIZE
MADD1 t7, a3, x1, y3
LD y2, YY, 9 * SIZE
MADD2 t8, a3, x2, y4
LD a3, AO1, 10 * SIZE
MADD3 t5, a2, x2, t5
LD y3, YY, 10 * SIZE
MADD4 t6, a2, x1, t6
LD a2, AO1, 9 * SIZE
MADD3 t7, a4, x2, t7
LD y4, YY, 11 * SIZE
MADD4 t8, a4, x1, t8
LD a4, AO1, 11 * SIZE
MADD1 t5, a5, x3, t5
ST t1, YY, 0 * SIZE
MADD2 t6, a5, x4, t6
LD a5, AO2, 8 * SIZE
MADD1 t7, a7, x3, t7
ST t2, YY, 1 * SIZE
MADD2 t8, a7, x4, t8
LD a7, AO2, 10 * SIZE
MADD3 t5, a6, x4, t5
ST t3, YY, 2 * SIZE
MADD4 t6, a6, x3, t6
LD a6, AO2, 9 * SIZE
MADD3 t7, a8, x4, t7
ST t4, YY, 3 * SIZE
MADD4 t8, a8, x3, t8
LD a8, AO2, 11 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 12 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 12 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 13 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 14 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 14 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 13 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 15 * SIZE
MADD4 t4, a4, x1, t4
LD a4, AO1, 15 * SIZE
MADD1 t1, a5, x3, t1
ST t5, YY, 4 * SIZE
MADD2 t2, a5, x4, t2
LD a5, AO2, 12 * SIZE
MADD1 t3, a7, x3, t3
ST t6, YY, 5 * SIZE
MADD2 t4, a7, x4, t4
LD a7, AO2, 14 * SIZE
MADD3 t1, a6, x4, t1
ST t7, YY, 6 * SIZE
MADD4 t2, a6, x3, t2
LD a6, AO2, 13 * SIZE
MADD3 t3, a8, x4, t3
ST t8, YY, 7 * SIZE
MADD4 t4, a8, x3, t4
LD a8, AO2, 15 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/*
 * Pipeline drain: store the pending t1..t4 (offsets 0..3), then compute the
 * last two elements of the block from the preloaded operands and store them
 * (offsets -4..-1 after YY has been advanced by 8 * SIZE).
 */
.L13:
ST t1, YY, 0 * SIZE
MADD1 t1, a1, x1, y1
ST t2, YY, 1 * SIZE
MADD2 t2, a1, x2, y2
ST t3, YY, 2 * SIZE
MADD1 t3, a3, x1, y3
ST t4, YY, 3 * SIZE
MADD2 t4, a3, x2, y4
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
MADD1 t3, a7, x3, t3
MADD2 t4, a7, x4, t4
MADD3 t1, a6, x4, t1
addi.d AO1, AO1, 8 * SIZE
MADD4 t2, a6, x3, t2
addi.d AO2, AO2, 8 * SIZE
MADD3 t3, a8, x4, t3
addi.d YY, YY, 8 * SIZE
MADD4 t4, a8, x3, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* Remainder when bit 1 of M is set: two Y elements against both columns. */
.L15:
andi I, M, 2
bge $r0, I, .L16
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
MADD1 t1, a1, x1, y1
LD a5, AO2, 0 * SIZE
MADD2 t2, a1, x2, y2
LD a6, AO2, 1 * SIZE
MADD1 t3, a3, x1, y3
LD a7, AO2, 2 * SIZE
MADD2 t4, a3, x2, y4
LD a8, AO2, 3 * SIZE
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
MADD1 t3, a7, x3, t3
MADD2 t4, a7, x4, t4
MADD3 t1, a6, x4, t1
addi.d YY, YY, 4 * SIZE
MADD4 t2, a6, x3, t2
addi.d AO1, AO1, 4 * SIZE
MADD3 t3, a8, x4, t3
addi.d AO2, AO2, 4 * SIZE
MADD4 t4, a8, x3, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* Remainder when bit 0 of M is set: one Y element against both columns. */
.L16:
andi I, M, 1
bge $r0, I, .L19
LD y1, YY, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a1, AO1, 0 * SIZE
LD a2, AO1, 1 * SIZE
MADD1 t1, a1, x1, y1
LD a5, AO2, 0 * SIZE
MADD2 t2, a1, x2, y2
LD a6, AO2, 1 * SIZE
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
MADD3 t1, a6, x4, t1
MADD4 t2, a6, x3, t2
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
.align 3
/* Next pair of columns. */
.L19:
addi.d J, J, -1
blt $r0, J, .L11
.align 3
/* Odd N: process the one remaining column (AO1 only), same structure as above. */
.L20:
andi J, N, 1
bge $r0, J, .L900
LD x1, X, 0 * SIZE
LD x2, X, 1 * SIZE
add.d X, X, INCX
MUL a1, ALPHA_R, x1
move AO1, A
MUL a2, ALPHA_I, x1
/* Same ALPHA combination as at .L11, for a single X element. */
#ifndef XCONJ
NMSUB x1, x2, ALPHA_I, a1
MADD x2, x2, ALPHA_R, a2
#else
MADD x1, x2, ALPHA_I, a1
MSUB x2, x2, ALPHA_R, a2
#endif
srai.d I, M, 2
move YY, YORIG
bge $r0, I, .L25
/* Pipeline fill: start t1..t4 for Y offsets 0..3, preload offsets 4..7. */
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a2, AO1, 1 * SIZE
LD y4, YY, 3 * SIZE
LD a4, AO1, 3 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 4 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 5 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 6 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 6 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 5 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 7 * SIZE
MADD4 t4, a4, x1, t4
addi.d I, I, -1
LD a4, AO1, 7 * SIZE
bge $r0, I, .L23
.align 3
/* Steady state: t5..t8 cover offsets 4..7, t1..t4 cover offsets 8..11 of the next block. */
.L22:
MADD1 t5, a1, x1, y1
LD y1, YY, 8 * SIZE
MADD2 t6, a1, x2, y2
LD a1, AO1, 8 * SIZE
MADD1 t7, a3, x1, y3
LD y2, YY, 9 * SIZE
MADD2 t8, a3, x2, y4
LD a3, AO1, 10 * SIZE
MADD3 t5, a2, x2, t5
LD y3, YY, 10 * SIZE
MADD4 t6, a2, x1, t6
LD a2, AO1, 9 * SIZE
MADD3 t7, a4, x2, t7
LD y4, YY, 11 * SIZE
MADD4 t8, a4, x1, t8
LD a4, AO1, 11 * SIZE
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 12 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 12 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 13 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 14 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 14 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 13 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 15 * SIZE
MADD4 t4, a4, x1, t4
LD a4, AO1, 15 * SIZE
ST t5, YY, 4 * SIZE
ST t6, YY, 5 * SIZE
ST t7, YY, 6 * SIZE
ST t8, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Pipeline drain: store pending t1..t4, then finish and store the last two elements of the block. */
.L23:
ST t1, YY, 0 * SIZE
MADD1 t1, a1, x1, y1
ST t2, YY, 1 * SIZE
MADD2 t2, a1, x2, y2
ST t3, YY, 2 * SIZE
MADD1 t3, a3, x1, y3
ST t4, YY, 3 * SIZE
MADD2 t4, a3, x2, y4
MADD3 t1, a2, x2, t1
addi.d AO1, AO1, 8 * SIZE
MADD4 t2, a2, x1, t2
addi.d YY, YY, 8 * SIZE
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* Remainder when bit 1 of M is set: two Y elements, single column. */
.L25:
andi I, M, 2
bge $r0, I, .L26
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
MADD1 t1, a1, x1, y1
MADD2 t2, a1, x2, y2
MADD1 t3, a3, x1, y3
MADD2 t4, a3, x2, y4
MADD3 t1, a2, x2, t1
addi.d YY, YY, 4 * SIZE
MADD4 t2, a2, x1, t2
addi.d AO1, AO1, 4 * SIZE
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* Remainder when bit 0 of M is set: one Y element, single column. */
.L26:
andi I, M, 1
bge $r0, I, .L900
LD y1, YY, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a1, AO1, 0 * SIZE
LD a2, AO1, 1 * SIZE
MADD1 t1, a1, x1, y1
MADD2 t2, a1, x2, y2
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
.align 3
/*
 * Write-back. YORIG is reused as scratch for the constant 2 * SIZE; if the
 * scaled INCY equals it, the loops already updated Y in place and the kernel
 * is done. Otherwise scatter BUFFER back to Y with stride INCY.
 */
.L900:
li.d YORIG, 2 * SIZE
srai.d I, M, 2
beq INCY, YORIG, .L999
move XX, BUFFER
bge $r0, I, .L905
.align 3
/* Scatter loop: four elements per trip, M >> 2 trips. */
.L902:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
LD a3, XX, 2 * SIZE
LD a4, XX, 3 * SIZE
LD a5, XX, 4 * SIZE
LD a6, XX, 5 * SIZE
LD a7, XX, 6 * SIZE
LD a8, XX, 7 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
ST a5, Y, 0 * SIZE
ST a6, Y, 1 * SIZE
add.d Y, Y, INCY
ST a7, Y, 0 * SIZE
ST a8, Y, 1 * SIZE
add.d Y, Y, INCY
addi.d XX, XX, 8 * SIZE
blt $r0, I, .L902
.align 3
/* Scatter the remaining M & 3 elements one at a time. */
.L905:
andi I, M, 3
bge $r0, I, .L999
.align 3
.L906:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
addi.d XX, XX, 2 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L906
.align 3
/* Common exit: restore the saved registers and release the frame (mirrors the prologue). */
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
fld.d $f24, $sp, 16
fld.d $f25, $sp, 24
#ifndef __64BIT__
fld.d $f18, $sp, 32
fld.d $f19, $sp, 40
fld.d $f20, $sp, 48
fld.d $f21, $sp, 56
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 32
#else
addi.d $sp, $sp, 64
#endif
/*
 * NOTE(review): $r4 and $f0 receive whatever $r17 (aliased BUFFER here) and
 * $f22 (aliased a1) currently hold. If these are the return-value registers,
 * the returned values carry no meaning - presumably callers ignore them;
 * confirm.
 */
move $r4, $r17
fmov.d $f0, $f22
/* Return to the address in $r1. */
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,556 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * Register aliases used by the kernel below.
 * Integer registers: problem sizes, data pointers and strides. INCY and
 * BUFFER do not arrive in their registers; the prologue loads both from the
 * stack.
 * NOTE(review): $r4-$r11 look like the LoongArch integer argument registers -
 * confirm against the C prototype of this kernel.
 */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r17
/* Base of the X data the compute loops read: either X itself or BUFFER. */
#define XORIG $r18
/* Running pointers (XX reads X data, YY writes Y) and loop counters (I: inner, J: outer). */
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
/* Pointers into the column(s) of A being processed; the kernel saves and restores $r23/$r24. */
#define AO1 $r23
#define AO2 $r24
/* The two components of the scalar multiplier. */
#define ALPHA_R $f0
#define ALPHA_I $f1
/* a1..a8: values loaded from A; also reused for the X copy loop and for the Y values being updated. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/* y1..y4: running sums built up by the inner loops. */
#define y1 $f14
#define y2 $f15
#define y3 $f16
#define y4 $f17
/* x1..x4: values loaded from the X working area. */
#define x1 $f3
#define x2 $f4
#define x3 $f2
#define x4 $f5
/* x5..x8 are defined but not referenced by the kernel body in this file. */
#define x5 $f6
#define x6 $f7
#define x7 $f18
#define x8 $f19
/*
 * Choose the multiply-accumulate macro behind MADD1..MADD4 from the
 * CONJ / XCONJ build flags. Resulting table (same for every combination
 * as the original four-way #if list):
 *
 *   CONJ  XCONJ | MADD1  MADD2  MADD3  MADD4
 *   ------------+---------------------------
 *    no    no   | MADD   MADD   NMSUB  MADD
 *    yes   no   | MADD   MADD   MADD   NMSUB
 *    no    yes  | MADD   NMSUB  MADD   MADD
 *    yes   yes  | MADD   NMSUB  NMSUB  NMSUB
 */
/* MADD1 does not depend on either flag. */
#define MADD1 MADD
/* MADD2 follows XCONJ only. */
#ifdef XCONJ
#define MADD2 NMSUB
#else
#define MADD2 MADD
#endif
/* MADD3 subtracts exactly when CONJ and XCONJ are both set or both unset. */
#if defined(CONJ) == defined(XCONJ)
#define MADD3 NMSUB
#else
#define MADD3 MADD
#endif
/* MADD4 follows CONJ only. */
#ifdef CONJ
#define MADD4 NMSUB
#else
#define MADD4 MADD
#endif
/*
 * Matrix-vector update kernel. Reviewer reading: complex GEMV in the
 * transposed form; the file name is not visible here - confirm.
 *
 * For each column of A the kernel accumulates the products of M column
 * elements with M X elements (two values each, at SIZE offsets 0 and 1)
 * through MADD1..MADD4, then adds the ALPHA-scaled sums to one Y element.
 * Columns are handled two at a time (AO1, AO2), with a one-column tail at
 * .L20 when N is odd.
 *
 * When the scaled INCX differs from 2 * SIZE, X is first gathered into
 * BUFFER and the loops read from BUFFER instead.
 *
 * Inputs as used below: M, N, A, LDA, X, INCX, Y in registers,
 * ALPHA_R / ALPHA_I in $f0 / $f1, INCY and BUFFER on the stack.
 * Comments below assume MADD d,a,b,c = a * b + c and NMSUB d,a,b,c =
 * c - a * b; these macros come from the included headers - confirm.
 */
PROLOGUE
/* INCY and BUFFER are read from the first two stack slots before the frame is allocated. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
/* Allocate the frame: 16 bytes when __64BIT__ is defined, 32 bytes otherwise. */
#ifdef __64BIT__
addi.d $sp, $sp, -16
#else
addi.d $sp, $sp, -32
#endif
/* Clear y1 (presumably MTC moves the zero register $r0 into it - confirm). */
MTC y1, $r0
/* Save the registers used as AO1 and AO2. */
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
/* Scale the strides by ZBASE_SHIFT (presumably element counts to byte offsets - confirm). */
slli.d LDA, LDA, ZBASE_SHIFT
/* Without __64BIT__ the registers $f18/$f19 are saved as well. */
#ifndef __64BIT__
fst.d $f18, $sp, 16
fst.d $f19, $sp, 24
#endif
slli.d INCX, INCX, ZBASE_SHIFT
/* Nothing to do when M <= 0 or N <= 0. */
bge $r0, M, .L999
slli.d INCY, INCY, ZBASE_SHIFT
bge $r0, N, .L999
/* If the scaled INCX equals 2 * SIZE the loops read X directly (XORIG = X). */
li.d I, 2 * SIZE
move XORIG, X
beq INCX, I, .L10
/* Otherwise gather X into BUFFER (XORIG = BUFFER); X itself is advanced while copying. */
srai.d I, M, 2
move XORIG, BUFFER
move YY, BUFFER
bge $r0, I, .L05
.align 3
/* Gather loop: four X elements (two values each) per trip, M >> 2 trips. */
.L02:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
add.d X, X, INCX
addi.d I, I, -1
/* YY is advanced first, so the stores below use negative offsets. */
addi.d YY, YY, 8 * SIZE
ST a1, YY, -8 * SIZE
ST a2, YY, -7 * SIZE
ST a3, YY, -6 * SIZE
ST a4, YY, -5 * SIZE
ST a5, YY, -4 * SIZE
ST a6, YY, -3 * SIZE
ST a7, YY, -2 * SIZE
ST a8, YY, -1 * SIZE
blt $r0, I, .L02
.align 3
/* Gather the remaining M & 3 elements one at a time. */
.L05:
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
addi.d I, I, -1
addi.d YY, YY, 2 * SIZE
blt $r0, I, .L06
.align 3
/* Outer loop over pairs of columns: J = N >> 1. From here on Y is the load pointer and YY the store pointer into Y. */
.L10:
srai.d J, N, 1
move YY, Y
bge $r0, J, .L20
.align 3
.L11:
/*
 * AO1 = A, AO2 = A + LDA, A += 2 * LDA. y2..y4 are copied from y1, which
 * was cleared by MTC in the prologue or at .L19. y1/y2 accumulate for the
 * AO1 column, y3/y4 for the AO2 column.
 */
move AO1, A
MOV y2, y1
add.d AO2, A, LDA
MOV y3, y1
add.d A, AO2, LDA
MOV y4, y1
/* Inner loop setup: I = M >> 2 blocks of four X elements, starting at XORIG. */
srai.d I, M, 2
move XX, XORIG
bge $r0, I, .L15
/* Pipeline fill: preload X offsets 0, 1, 3 and both columns at offsets 0..3 (x3 is loaded inside the loop). */
LD x1, XX, 0 * SIZE
LD x2, XX, 1 * SIZE
LD x4, XX, 3 * SIZE
LD a1, AO1, 0 * SIZE
LD a3, AO2, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD a4, AO2, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD a7, AO2, 2 * SIZE
LD a6, AO1, 3 * SIZE
LD a8, AO2, 3 * SIZE
addi.d I, I, -1
/* Only one block in total: skip the steady-state loop and drain. */
bge $r0, I, .L13
.align 3
/*
 * Steady state: each trip consumes four X elements (8 * SIZE) against both
 * columns while loading the operands for the following ones.
 * XX and AO2 are advanced in the middle of the loop body, so the final
 * loads of x4 and a8 use offset 3 relative to the already advanced pointers;
 * AO1 is advanced last.
 */
.L12:
MADD1 y1, a1, x1, y1
LD x3, XX, 2 * SIZE
MADD2 y2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD1 y3, a3, x1, y3
MADD2 y4, a3, x2, y4
LD a3, AO2, 4 * SIZE
MADD3 y1, a2, x2, y1
MADD4 y2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD3 y3, a4, x2, y3
LD x2, XX, 5 * SIZE
MADD4 y4, a4, x1, y4
LD a4, AO2, 5 * SIZE
MADD1 y1, a5, x3, y1
LD x1, XX, 4 * SIZE
MADD2 y2, a5, x4, y2
LD a5, AO1, 6 * SIZE
MADD1 y3, a7, x3, y3
MADD2 y4, a7, x4, y4
LD a7, AO2, 6 * SIZE
MADD3 y1, a6, x4, y1
addi.d I, I, -1
MADD4 y2, a6, x3, y2
LD a6, AO1, 7 * SIZE
MADD3 y3, a8, x4, y3
LD x4, XX, 7 * SIZE
MADD4 y4, a8, x3, y4
LD a8, AO2, 7 * SIZE
MADD1 y1, a1, x1, y1
LD x3, XX, 6 * SIZE
MADD2 y2, a1, x2, y2
LD a1, AO1, 8 * SIZE
MADD1 y3, a3, x1, y3
MADD2 y4, a3, x2, y4
LD a3, AO2, 8 * SIZE
MADD3 y1, a2, x2, y1
MADD4 y2, a2, x1, y2
LD a2, AO1, 9 * SIZE
MADD3 y3, a4, x2, y3
LD x2, XX, 9 * SIZE
MADD4 y4, a4, x1, y4
LD a4, AO2, 9 * SIZE
MADD1 y1, a5, x3, y1
LD x1, XX, 8 * SIZE
MADD2 y2, a5, x4, y2
LD a5, AO1, 10 * SIZE
MADD1 y3, a7, x3, y3
addi.d XX, XX, 8 * SIZE
MADD2 y4, a7, x4, y4
LD a7, AO2, 10 * SIZE
MADD3 y1, a6, x4, y1
addi.d AO2, AO2, 8 * SIZE
MADD4 y2, a6, x3, y2
LD a6, AO1, 11 * SIZE
MADD3 y3, a8, x4, y3
LD x4, XX, 3 * SIZE
MADD4 y4, a8, x3, y4
LD a8, AO2, 3 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Pipeline drain: consume the last block of four X elements, loading only what is still needed. */
.L13:
MADD1 y1, a1, x1, y1
LD x3, XX, 2 * SIZE
MADD2 y2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD1 y3, a3, x1, y3
MADD2 y4, a3, x2, y4
LD a3, AO2, 4 * SIZE
MADD3 y1, a2, x2, y1
MADD4 y2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD3 y3, a4, x2, y3
LD x2, XX, 5 * SIZE
MADD4 y4, a4, x1, y4
LD a4, AO2, 5 * SIZE
MADD1 y1, a5, x3, y1
LD x1, XX, 4 * SIZE
MADD2 y2, a5, x4, y2
LD a5, AO1, 6 * SIZE
MADD1 y3, a7, x3, y3
MADD2 y4, a7, x4, y4
LD a7, AO2, 6 * SIZE
MADD3 y1, a6, x4, y1
MADD4 y2, a6, x3, y2
LD a6, AO1, 7 * SIZE
MADD3 y3, a8, x4, y3
LD x4, XX, 7 * SIZE
MADD4 y4, a8, x3, y4
LD a8, AO2, 7 * SIZE
MADD1 y1, a1, x1, y1
LD x3, XX, 6 * SIZE
MADD2 y2, a1, x2, y2
MADD1 y3, a3, x1, y3
MADD2 y4, a3, x2, y4
MADD3 y1, a2, x2, y1
MADD4 y2, a2, x1, y2
MADD3 y3, a4, x2, y3
MADD4 y4, a4, x1, y4
MADD1 y1, a5, x3, y1
MADD2 y2, a5, x4, y2
MADD1 y3, a7, x3, y3
MADD2 y4, a7, x4, y4
MADD3 y1, a6, x4, y1
addi.d XX, XX, 8 * SIZE
MADD4 y2, a6, x3, y2
addi.d AO1, AO1, 8 * SIZE
MADD3 y3, a8, x4, y3
addi.d AO2, AO2, 8 * SIZE
MADD4 y4, a8, x3, y4
.align 3
/* Remainder when bit 1 of M is set: two X elements against both columns. */
.L15:
andi I, M, 2
bge $r0, I, .L17
LD x1, XX, 0 * SIZE
LD x2, XX, 1 * SIZE
LD x3, XX, 2 * SIZE
LD x4, XX, 3 * SIZE
LD a1, AO1, 0 * SIZE
LD a3, AO2, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD a4, AO2, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD a7, AO2, 2 * SIZE
LD a6, AO1, 3 * SIZE
LD a8, AO2, 3 * SIZE
MADD1 y1, a1, x1, y1
MADD2 y2, a1, x2, y2
MADD1 y3, a3, x1, y3
MADD2 y4, a3, x2, y4
MADD3 y1, a2, x2, y1
MADD4 y2, a2, x1, y2
MADD3 y3, a4, x2, y3
MADD4 y4, a4, x1, y4
MADD1 y1, a5, x3, y1
MADD2 y2, a5, x4, y2
MADD1 y3, a7, x3, y3
MADD2 y4, a7, x4, y4
MADD3 y1, a6, x4, y1
addi.d XX, XX, 4 * SIZE
MADD4 y2, a6, x3, y2
addi.d AO1, AO1, 4 * SIZE
MADD3 y3, a8, x4, y3
addi.d AO2, AO2, 4 * SIZE
MADD4 y4, a8, x3, y4
.align 3
/*
 * Remainder when bit 0 of M is set: one X element against both columns.
 * NOTE(review): the .align below sits between andi and the branch, so any
 * padding it emits lies on the executed path; .L18 is not a branch target
 * in this file.
 */
.L17:
andi I, M, 1
.align 3
bge $r0, I, .L19
.L18:
LD x1, XX, 0 * SIZE
LD x2, XX, 1 * SIZE
LD a1, AO1, 0 * SIZE
LD a3, AO2, 0 * SIZE
MADD1 y1, a1, x1, y1
LD a2, AO1, 1 * SIZE
MADD2 y2, a1, x2, y2
LD a4, AO2, 1 * SIZE
MADD1 y3, a3, x1, y3
MADD2 y4, a3, x2, y4
MADD3 y1, a2, x2, y1
MADD4 y2, a2, x1, y2
MADD3 y3, a4, x2, y3
MADD4 y4, a4, x1, y4
.align 3
/*
 * Update two Y elements with the finished sums:
 *   a1 += y1 * ALPHA_R - y2 * ALPHA_I,  a2 += y1 * ALPHA_I + y2 * ALPHA_R
 *   a3 += y3 * ALPHA_R - y4 * ALPHA_I,  a4 += y3 * ALPHA_I + y4 * ALPHA_R
 * Values are loaded through Y and stored through YY; both advance by INCY
 * per element. y1 is cleared again for the next column pair.
 */
.L19:
LD a1, Y, 0 * SIZE
LD a2, Y, 1 * SIZE
add.d Y, Y, INCY
LD a3, Y, 0 * SIZE
LD a4, Y, 1 * SIZE
add.d Y, Y, INCY
MADD a1, y1, ALPHA_R, a1
MADD a2, y1, ALPHA_I, a2
MADD a3, y3, ALPHA_R, a3
MADD a4, y3, ALPHA_I, a4
NMSUB a1, y2, ALPHA_I, a1
MADD a2, y2, ALPHA_R, a2
NMSUB a3, y4, ALPHA_I, a3
MTC y1, $r0
MADD a4, y4, ALPHA_R, a4
addi.d J, J, -1
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
add.d YY, YY, INCY
ST a3, YY, 0 * SIZE
ST a4, YY, 1 * SIZE
add.d YY, YY, INCY
blt $r0, J, .L11
.align 3
/*
 * Odd N: process the one remaining column (AO1 only). Here the four sums
 * belong to the same column: MADD1/MADD2 feed y1/y2 and MADD3/MADD4 feed
 * y3/y4; the halves are added together at .L29.
 */
.L20:
andi J, N, 1
MOV y2, y1
srai.d I, M, 2
bge $r0, J, .L999
MOV y3, y1
move AO1, A
MOV y4, y1
move XX, XORIG
bge $r0, I, .L25
/* Pipeline fill: preload column offsets 0..3 and X offsets 0, 1, 3. */
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x4, XX, 3 * SIZE
addi.d I, I, -1
LD a6, AO1, 3 * SIZE
bge $r0, I, .L23
.align 3
/* Steady state: four X elements (8 * SIZE) per trip; pointers advance at the end of the trip. */
.L22:
MADD1 y1, a1, x1, y1
LD x3, XX, 2 * SIZE
MADD2 y2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD3 y3, a2, x2, y3
LD x2, XX, 5 * SIZE
MADD4 y4, a2, x1, y4
LD a2, AO1, 5 * SIZE
MADD1 y1, a5, x3, y1
LD x1, XX, 4 * SIZE
MADD2 y2, a5, x4, y2
LD a5, AO1, 6 * SIZE
MADD3 y3, a6, x4, y3
LD x4, XX, 7 * SIZE
MADD4 y4, a6, x3, y4
LD a6, AO1, 7 * SIZE
MADD1 y1, a1, x1, y1
LD x3, XX, 6 * SIZE
MADD2 y2, a1, x2, y2
LD a1, AO1, 8 * SIZE
MADD3 y3, a2, x2, y3
LD x2, XX, 9 * SIZE
MADD4 y4, a2, x1, y4
LD a2, AO1, 9 * SIZE
MADD1 y1, a5, x3, y1
LD x1, XX, 8 * SIZE
MADD2 y2, a5, x4, y2
LD a5, AO1, 10 * SIZE
MADD3 y3, a6, x4, y3
LD x4, XX, 11 * SIZE
MADD4 y4, a6, x3, y4
LD a6, AO1, 11 * SIZE
addi.d I, I, -1
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Pipeline drain: consume the last block of four X elements. */
.L23:
MADD1 y1, a1, x1, y1
LD x3, XX, 2 * SIZE
MADD2 y2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD3 y3, a2, x2, y3
LD x2, XX, 5 * SIZE
MADD4 y4, a2, x1, y4
LD a2, AO1, 5 * SIZE
MADD1 y1, a5, x3, y1
LD x1, XX, 4 * SIZE
MADD2 y2, a5, x4, y2
LD a5, AO1, 6 * SIZE
MADD3 y3, a6, x4, y3
LD x4, XX, 7 * SIZE
MADD4 y4, a6, x3, y4
LD a6, AO1, 7 * SIZE
MADD1 y1, a1, x1, y1
LD x3, XX, 6 * SIZE
MADD2 y2, a1, x2, y2
MADD3 y3, a2, x2, y3
MADD4 y4, a2, x1, y4
MADD1 y1, a5, x3, y1
MADD2 y2, a5, x4, y2
MADD3 y3, a6, x4, y3
addi.d XX, XX, 8 * SIZE
MADD4 y4, a6, x3, y4
addi.d AO1, AO1, 8 * SIZE
.align 3
/* Remainder when bit 1 of M is set: two X elements, single column. */
.L25:
andi I, M, 2
bge $r0, I, .L27
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
MADD1 y1, a1, x1, y1
LD x3, XX, 2 * SIZE
MADD2 y2, a1, x2, y2
LD a6, AO1, 3 * SIZE
MADD3 y3, a2, x2, y3
LD x4, XX, 3 * SIZE
MADD4 y4, a2, x1, y4
MADD1 y1, a5, x3, y1
MADD2 y2, a5, x4, y2
MADD3 y3, a6, x4, y3
addi.d XX, XX, 4 * SIZE
MADD4 y4, a6, x3, y4
addi.d AO1, AO1, 4 * SIZE
.align 3
/*
 * Remainder when bit 0 of M is set: one X element, single column.
 * NOTE(review): as at .L17, the .align sits between andi and the branch;
 * .L28 is not a branch target in this file.
 */
.L27:
andi I, M, 1
.align 3
bge $r0, I, .L29
.L28:
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
MADD1 y1, a1, x1, y1
MADD2 y2, a1, x2, y2
MADD3 y3, a2, x2, y3
MADD4 y4, a2, x1, y4
.align 3
/*
 * Merge the split sums (y1 += y3, y2 += y4) and update the last Y element:
 *   a1 += y1 * ALPHA_R - y2 * ALPHA_I,  a2 += y1 * ALPHA_I + y2 * ALPHA_R
 */
.L29:
LD a1, Y, 0 * SIZE
LD a2, Y, 1 * SIZE
ADD y1, y1, y3
ADD y2, y2, y4
MADD a1, y1, ALPHA_R, a1
MADD a2, y1, ALPHA_I, a2
NMSUB a1, y2, ALPHA_I, a1
MADD a2, y2, ALPHA_R, a2
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
.align 3
/* Common exit: restore the saved registers and release the frame (mirrors the prologue). */
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
#ifndef __64BIT__
fld.d $f18, $sp, 16
fld.d $f19, $sp, 24
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 16
#else
addi.d $sp, $sp, 32
#endif
/*
 * NOTE(review): $r4 and $f0 receive whatever $r17 (aliased BUFFER here) and
 * $f22 (aliased a1) currently hold. If these are the return-value registers,
 * the returned values carry no meaning - presumably callers ignore them;
 * confirm.
 */
move $r4, $r17
fmov.d $f0, $f22
/* Return to the address in $r1. */
jirl $r0, $r1, 0x0
EPILOGUE

Some files were not shown because too many files have changed in this diff Show More