Merge branch 'develop' into risc-v
This commit is contained in:
commit
e284c048df
|
@ -2,6 +2,9 @@ name: continuous build
|
|||
|
||||
on: [push, pull_request]
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
@ -150,6 +153,7 @@ jobs:
|
|||
matrix:
|
||||
msystem: [MINGW64, MINGW32, CLANG64]
|
||||
idx: [int32, int64]
|
||||
build-type: [Release]
|
||||
include:
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
|
@ -173,6 +177,11 @@ jobs:
|
|||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
build-type: None
|
||||
exclude:
|
||||
- msystem: MINGW32
|
||||
idx: int64
|
||||
|
@ -215,11 +224,11 @@ jobs:
|
|||
path: C:/msys64/home/runneradmin/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }}
|
||||
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch.
|
||||
restore-keys: |
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}
|
||||
|
||||
- name: Configure ccache
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
|
||||
|
@ -235,7 +244,8 @@ jobs:
|
|||
- name: Configure OpenBLAS
|
||||
run: |
|
||||
mkdir build && cd build
|
||||
cmake -DBUILD_SHARED_LIBS=ON \
|
||||
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
|
||||
-DBUILD_SHARED_LIBS=ON \
|
||||
-DBUILD_STATIC_LIBS=ON \
|
||||
-DDYNAMIC_ARCH=ON \
|
||||
-DUSE_THREAD=ON \
|
||||
|
@ -258,6 +268,7 @@ jobs:
|
|||
timeout-minutes: 60
|
||||
run: cd build && ctest
|
||||
|
||||
|
||||
cross_build:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
|
@ -267,7 +278,7 @@ jobs:
|
|||
include:
|
||||
- target: mips64el
|
||||
triple: mips64el-linux-gnuabi64
|
||||
opts: DYNAMIC_ARCH=1
|
||||
opts: DYNAMIC_ARCH=1 TARGET=GENERIC
|
||||
- target: riscv64
|
||||
triple: riscv64-linux-gnu
|
||||
opts: TARGET=RISCV64_GENERIC
|
||||
|
|
|
@ -0,0 +1,117 @@
|
|||
name: mips64 qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: MIPS64_GENERIC
|
||||
triple: mips64el-linux-gnuabi64
|
||||
opts: NO_SHARED=1 TARGET=MIPS64_GENERIC
|
||||
- target: SICORTEX
|
||||
triple: mips64el-linux-gnuabi64
|
||||
opts: NO_SHARED=1 TARGET=SICORTEX
|
||||
- target: I6400
|
||||
triple: mipsisa64r6el-linux-gnuabi64
|
||||
opts: NO_SHARED=1 TARGET=I6400
|
||||
- target: P6600
|
||||
triple: mipsisa64r6el-linux-gnuabi64
|
||||
opts: NO_SHARED=1 TARGET=P6600
|
||||
- target: I6500
|
||||
triple: mipsisa64r6el-linux-gnuabi64
|
||||
opts: NO_SHARED=1 TARGET=I6500
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install build deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
|
||||
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross
|
||||
|
||||
- name: checkout qemu
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
repository: qemu/qemu
|
||||
path: qemu
|
||||
ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2
|
||||
|
||||
- name: build qemu
|
||||
run: |
|
||||
cd qemu
|
||||
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
|
||||
make -j$(nproc)
|
||||
make install
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: build OpenBLAS
|
||||
run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
- name: test
|
||||
run: |
|
||||
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
|
||||
qemu-mips64el ./utest/openblas_utest
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat
|
|
@ -17,6 +17,10 @@ on:
|
|||
# it only makes sense to test if this file has been changed
|
||||
|
||||
name: Nightly-Homebrew-Build
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build-OpenBLAS-with-Homebrew:
|
||||
runs-on: macos-latest
|
||||
|
|
|
@ -30,7 +30,7 @@ matrix:
|
|||
before_script: &common-before
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
script:
|
||||
- travis_wait 40 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- travis_wait 50 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
|
@ -104,7 +104,7 @@ matrix:
|
|||
- sudo apt-get update
|
||||
- sudo apt-get install gcc-9 gfortran-9 -y
|
||||
script:
|
||||
- travis_wait 40 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- travis_wait 50 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
|
@ -121,7 +121,7 @@ matrix:
|
|||
- sudo apt-get update
|
||||
- sudo apt-get install gcc-9 gfortran-9 -y
|
||||
script:
|
||||
- travis_wait 40 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- travis_wait 50 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
|
|
|
@ -212,10 +212,10 @@ if(NOT NO_LAPACKE)
|
|||
add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
|
||||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>")
|
||||
endif()
|
||||
if(BUILD_RELAPACK)
|
||||
add_library(RELAPACK OBJECT ${RELA_SOURCES})
|
||||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
|
||||
endif()
|
||||
#if(BUILD_RELAPACK)
|
||||
# add_library(RELAPACK OBJECT ${RELA_SOURCES})
|
||||
# list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
|
||||
#endif()
|
||||
set(OpenBLAS_LIBS "")
|
||||
if(BUILD_STATIC_LIBS)
|
||||
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
|
|
|
@ -211,4 +211,5 @@ In chronological order:
|
|||
* PLCT Lab, Institute of Software Chinese Academy of Sciences
|
||||
* [2022-03] Support RISC-V Vector Intrinisc 1.0 version.
|
||||
|
||||
|
||||
* Pablo Romero <https://github.com/pablorcum>
|
||||
* [2022-08] Fix building from sources for QNX
|
4
Makefile
4
Makefile
|
@ -278,7 +278,11 @@ prof_lapack : lapack_prebuild
|
|||
lapack_prebuild :
|
||||
ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK)))
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
-@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
|
|
@ -70,12 +70,12 @@ endif
|
|||
ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
|
@ -89,17 +89,17 @@ endif
|
|||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-V1 is only available
|
||||
# in GCC>=9.4
|
||||
# in GCC>=10.4
|
||||
ifeq ($(CORE), NEOVERSEV1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
|
||||
CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||
endif
|
||||
|
@ -119,17 +119,21 @@ endif
|
|||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N2 is only available
|
||||
# in GCC>=9.4
|
||||
# in GCC>=10.4
|
||||
ifeq ($(CORE), NEOVERSEN2)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
||||
ifneq ($(OSNAME), Darwin)
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
endif
|
||||
|
|
|
@ -14,6 +14,11 @@ OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
|
|||
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
|
||||
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
|
||||
PKG_EXTRALIB := $(EXTRALIB)
|
||||
ifeq ($(INTERFACE64),1)
|
||||
SUFFIX64=64
|
||||
endif
|
||||
PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc"
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
PKG_EXTRALIB += -lomp
|
||||
|
@ -150,13 +155,19 @@ endif
|
|||
endif
|
||||
|
||||
#Generating openblas.pc
|
||||
@echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
|
||||
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
|
||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
|
||||
@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
|
||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
|
||||
ifeq ($(INTERFACE64),1)
|
||||
SUFFIX64=64
|
||||
endif
|
||||
PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc"
|
||||
|
||||
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
|
||||
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
|
||||
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
|
||||
@echo 'version='$(VERSION) >> "$(PKGFILE)"
|
||||
@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
|
||||
@cat openblas.pc.in >> "$(PKGFILE)"
|
||||
|
||||
|
||||
#Generating OpenBLASConfig.cmake
|
||||
|
|
|
@ -60,9 +60,9 @@ all: getarch_2nd
|
|||
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||
|
||||
$(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch
|
||||
./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)"
|
||||
./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" $(TARGET_FLAGS) $(CFLAGS)
|
||||
ifneq ($(ONLY_CBLAS), 1)
|
||||
./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" "$(TARGET_FLAGS)"
|
||||
./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" $(TARGET_FLAGS)
|
||||
else
|
||||
#When we only build CBLAS, we set NOFORTRAN=2
|
||||
echo "NOFORTRAN=2" >> $(TARGET_MAKE)
|
||||
|
@ -77,8 +77,8 @@ endif
|
|||
|
||||
|
||||
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
|
||||
avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_AVX512); \
|
||||
rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_RV64GV); \
|
||||
avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \
|
||||
rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \
|
||||
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
|
||||
|
||||
getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy
|
||||
|
|
|
@ -131,6 +131,9 @@ BUILD_LAPACK_DEPRECATED = 1
|
|||
|
||||
# Build RecursiveLAPACK on top of LAPACK
|
||||
# BUILD_RELAPACK = 1
|
||||
# Have RecursiveLAPACK actually replace standard LAPACK routines instead of
|
||||
# just adding its equivalents with a RELAPACK_ prefix
|
||||
# RELAPACK_REPLACE = 1
|
||||
|
||||
# If you want to use the legacy threaded Level 3 implementation.
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
@ -207,7 +210,7 @@ NO_AFFINITY = 1
|
|||
# to the user space. If bigphysarea is enabled, it will use it.
|
||||
# DEVICEDRIVER_ALLOCATION = 1
|
||||
|
||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only).
|
||||
# CONSISTENT_FPCSR = 1
|
||||
|
||||
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
|
||||
|
|
|
@ -9,6 +9,10 @@ ifndef TOPDIR
|
|||
TOPDIR = .
|
||||
endif
|
||||
|
||||
ifndef RELAPACK_REPLACE
|
||||
RELAPACK_REPLACE=0
|
||||
endif
|
||||
|
||||
# we need to use the host system's architecture for getarch compile options even especially when cross-compiling
|
||||
HOSTARCH := $(shell uname -m)
|
||||
ifeq ($(HOSTARCH), amd64)
|
||||
|
|
|
@ -143,6 +143,7 @@ ifeq ($(C_COMPILER), CLANG)
|
|||
CCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
# AVX2 support was added in 4.7.0
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
|
@ -159,6 +160,7 @@ endif
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
|
|
@ -141,7 +141,7 @@ jobs:
|
|||
|
||||
- job: OSX_OpenMP
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
|
@ -151,15 +151,23 @@ jobs:
|
|||
|
||||
- job: OSX_GCC_Nothreads
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
- job: OSX_GCC12
|
||||
pool:
|
||||
vmImage: 'macOS-latest'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make CC=gcc-12 FC=gfortran-12
|
||||
|
||||
- job: OSX_OpenMP_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
|
@ -172,7 +180,7 @@ jobs:
|
|||
|
||||
- job: OSX_OpenMP_Clang_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
|
@ -188,7 +196,7 @@ jobs:
|
|||
|
||||
- job: OSX_dynarch_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
|
@ -196,13 +204,13 @@ jobs:
|
|||
- script: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
|
||||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
|
||||
cmake --build .
|
||||
ctest
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
|
||||
|
@ -235,7 +243,7 @@ jobs:
|
|||
|
||||
- job: OSX_NDK_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
|
@ -255,7 +263,7 @@ jobs:
|
|||
|
||||
- job: OSX_IOS_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1
|
||||
|
|
|
@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#endif
|
||||
|
||||
/* Benchmarks should allocate with cacheline (often 64 bytes) alignment
|
||||
to avoid unreliable results. This technique, storing the allocated
|
||||
pointer value just before the aligned memory, doesn't require
|
||||
C11's aligned_alloc for compatibility with older compilers. */
|
||||
static void *aligned_alloc_cacheline(size_t n)
|
||||
{
|
||||
void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1);
|
||||
if (p) {
|
||||
void **newp = (void **)
|
||||
(((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE);
|
||||
newp[-1] = p;
|
||||
p = newp;
|
||||
}
|
||||
return p;
|
||||
}
|
||||
#define malloc aligned_alloc_cacheline
|
||||
#define free(p) free((p) ? ((void **)(p))[-1] : (p))
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
struct timeval start, stop;
|
||||
#elif defined(__APPLE__)
|
||||
|
|
|
@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *x, *y;
|
||||
FLOAT *x;
|
||||
FLOAT alpha[2] = { 2.0, 2.0 };
|
||||
blasint m, i;
|
||||
blasint inc_x=1,inc_y=1;
|
||||
|
@ -74,10 +74,6 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef __linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
@ -91,30 +87,20 @@ int main(int argc, char *argv[]){
|
|||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
begin();
|
||||
|
||||
for (l=0; l<loops; l++)
|
||||
{
|
||||
SCAL (&m, alpha, x, &inc_x);
|
||||
|
||||
}
|
||||
end();
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
timeg += time1;
|
||||
|
||||
}
|
||||
|
||||
timeg /= loops;
|
||||
timeg = time1 / loops;
|
||||
|
||||
#ifdef COMPLEX
|
||||
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);
|
||||
|
|
|
@ -155,6 +155,39 @@ if (${CORE} STREQUAL A64FX)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEN2)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEV1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEN1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL ARMV8SVE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
|
|
|
@ -123,7 +123,8 @@ set(SLASRC
|
|||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
|
||||
slarmm.f slatrs3.f strsyl3.f sgelst.f)
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
|
@ -221,7 +222,8 @@ set(CLASRC
|
|||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cungtsqr_row.f cunhr_col.f )
|
||||
cungtsqr.f cungtsqr_row.f cunhr_col.f
|
||||
clatrs3.f ctrsyl3.f cgelst.f)
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
|
@ -313,7 +315,8 @@ set(DLASRC
|
|||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
|
||||
dlarmm.f dlatrs3.f dtrsyl3.f dgelst.f)
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
|
@ -415,7 +418,8 @@ set(ZLASRC
|
|||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zungtsqr_row.f zunhr_col.f)
|
||||
zungtsqr.f zungtsqr_row.f zunhr_col.f
|
||||
zlatrs3.f ztrsyl3.f zgelst.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
@ -617,7 +621,8 @@ set(SLASRC
|
|||
ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c
|
||||
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
|
||||
sgesvdq.c slaorhr_col_getrfnp.c
|
||||
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c )
|
||||
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
|
||||
slarmm.c slatrs3.c strsyl3.c sgelst.c)
|
||||
|
||||
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
|
||||
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
|
||||
|
@ -714,7 +719,8 @@ set(CLASRC
|
|||
cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c
|
||||
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
|
||||
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
|
||||
cungtsqr.c cungtsqr_row.c cunhr_col.c )
|
||||
cungtsqr.c cungtsqr_row.c cunhr_col.c
|
||||
clatrs3.c ctrsyl3.c cgelst.c)
|
||||
|
||||
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
|
||||
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
|
||||
|
@ -805,7 +811,8 @@ set(DLASRC
|
|||
dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c
|
||||
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
|
||||
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
|
||||
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c )
|
||||
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
|
||||
dlarmm.c dlatrs3.c dtrsyl3.c dgelst.c)
|
||||
|
||||
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
|
||||
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
|
||||
|
@ -906,7 +913,7 @@ set(ZLASRC
|
|||
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
|
||||
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
|
||||
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
|
||||
zungtsqr.c zungtsqr_row.c zunhr_col.c)
|
||||
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
|
||||
|
||||
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
|
||||
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
|
||||
|
@ -999,6 +1006,9 @@ endforeach ()
|
|||
|
||||
if (NOT C_LAPACK)
|
||||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize")
|
||||
endif()
|
||||
else ()
|
||||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
endif ()
|
||||
|
|
|
@ -318,6 +318,8 @@ set(CSRC
|
|||
lapacke_clacn2.c
|
||||
lapacke_clag2z.c
|
||||
lapacke_clag2z_work.c
|
||||
lapacke_clangb.c
|
||||
lapacke_clangb_work.c
|
||||
lapacke_clange.c
|
||||
lapacke_clange_work.c
|
||||
lapacke_clanhe.c
|
||||
|
@ -803,6 +805,8 @@ set(DSRC
|
|||
lapacke_dlag2s_work.c
|
||||
lapacke_dlamch.c
|
||||
lapacke_dlamch_work.c
|
||||
lapacke_dlangb.c
|
||||
lapacke_dlangb_work.c
|
||||
lapacke_dlange.c
|
||||
lapacke_dlange_work.c
|
||||
lapacke_dlansy.c
|
||||
|
@ -1381,6 +1385,8 @@ set(SSRC
|
|||
lapacke_slag2d_work.c
|
||||
lapacke_slamch.c
|
||||
lapacke_slamch_work.c
|
||||
lapacke_slangb.c
|
||||
lapacke_slangb_work.c
|
||||
lapacke_slange.c
|
||||
lapacke_slange_work.c
|
||||
lapacke_slansy.c
|
||||
|
@ -2089,6 +2095,8 @@ set(ZSRC
|
|||
lapacke_zlacrm_work.c
|
||||
lapacke_zlag2c.c
|
||||
lapacke_zlag2c_work.c
|
||||
lapacke_zlangb.c
|
||||
lapacke_zlangb_work.c
|
||||
lapacke_zlange.c
|
||||
lapacke_zlange_work.c
|
||||
lapacke_zlanhe.c
|
||||
|
@ -2481,6 +2489,8 @@ set(Utils_SRC
|
|||
lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c
|
||||
lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c
|
||||
lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c
|
||||
lapacke_ctz_nancheck.c lapacke_ctz_trans.c lapacke_dtz_nancheck.c lapacke_dtz_trans.c
|
||||
lapacke_stz_nancheck.c lapacke_stz_trans.c lapacke_ztz_nancheck.c lapacke_ztz_trans.c
|
||||
)
|
||||
|
||||
set(LAPACKE_REL_SRC "")
|
||||
|
|
|
@ -2,7 +2,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
|||
libsuffix=@SUFFIX64_UNDERSCORE@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OPENBLAS_VERSION@
|
||||
|
|
|
@ -197,14 +197,14 @@ if (DEFINED TARGET)
|
|||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2)
|
||||
if ((${TARGET} STREQUAL HASWELL OR ${TARGET} STREQUAL ZEN) AND NOT NO_AVX2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2 -mfma")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_AVX)
|
||||
|
|
8
common.h
8
common.h
|
@ -90,7 +90,7 @@ extern "C" {
|
|||
#endif
|
||||
#include <time.h>
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_QNX)
|
||||
#include <malloc.h>
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
@ -107,7 +107,7 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#if defined(OS_HAIKU) || defined(OS_QNX)
|
||||
#define NO_SYSV_IPC
|
||||
#endif
|
||||
|
||||
|
@ -387,6 +387,10 @@ typedef int blasint;
|
|||
#endif
|
||||
*/
|
||||
|
||||
#ifdef __EMSCRIPTEN__
|
||||
#define YIELDING
|
||||
#endif
|
||||
|
||||
#ifndef YIELDING
|
||||
#define YIELDING sched_yield()
|
||||
#endif
|
||||
|
|
|
@ -50,6 +50,7 @@ typedef struct {
|
|||
#ifdef BUILD_BFLOAT16
|
||||
int sbgemm_p, sbgemm_q, sbgemm_r;
|
||||
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
||||
int sbgemm_align_k;
|
||||
|
||||
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
|
|
23
cpuid_x86.c
23
cpuid_x86.c
|
@ -1544,6 +1544,17 @@ int get_cpuname(void){
|
|||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 11: //family 6 exmodel 11
|
||||
switch (model) {
|
||||
case 7: // Raptor Lake
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 0x7:
|
||||
|
@ -2334,6 +2345,18 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
case 11:
|
||||
switch (model) {
|
||||
case 7: // Raptor Lake
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
#endif
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
case 15:
|
||||
if (model <= 0x2) return CORE_NORTHWOOD;
|
||||
else return CORE_PRESCOTT;
|
||||
|
|
5
ctest.c
5
ctest.c
|
@ -173,3 +173,8 @@ HAVE_C11
|
|||
ARCH_E2K
|
||||
#endif
|
||||
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
ARCH_RISCV64
|
||||
OS_WINDOWS
|
||||
#endif
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ else()
|
|||
c_${float_char}blas1.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat1 m)
|
||||
endif()
|
||||
add_test(NAME "x${float_char}cblat1"
|
||||
|
@ -65,7 +65,7 @@ else()
|
|||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat2 m)
|
||||
endif()
|
||||
add_test(NAME "x${float_char}cblat2"
|
||||
|
@ -90,7 +90,7 @@ else()
|
|||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat3 m)
|
||||
endif()
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
|
|
|
@ -969,7 +969,7 @@ real *sfac;
|
|||
1.17 };
|
||||
|
||||
/* Local variables */
|
||||
extern /* Subroutine */ srottest_();
|
||||
extern /* Subroutine */ void srottest_();
|
||||
static integer i__, k, ksize;
|
||||
extern /* Subroutine */ int stest_(), srotmtest_();
|
||||
static integer ki, kn;
|
||||
|
|
|
@ -304,6 +304,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M;
|
||||
}
|
||||
|
||||
BLASLONG pad_min_l = min_l;
|
||||
#if defined(HALF)
|
||||
#if defined(DYNAMIC_ARCH)
|
||||
pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1);
|
||||
#else
|
||||
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* First, we have to move data A to L2 cache */
|
||||
min_i = m_to - m_from;
|
||||
l1stride = 1;
|
||||
|
@ -350,7 +359,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
|
||||
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
|
||||
sb + min_l * (jjs - js) * COMPSIZE * l1stride);
|
||||
sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride);
|
||||
|
||||
STOP_RPCC(outercost);
|
||||
|
||||
|
@ -358,10 +367,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
|
||||
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
|
||||
sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
|
||||
sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
|
||||
#else
|
||||
KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha,
|
||||
sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
|
||||
sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
|
||||
#endif
|
||||
|
||||
STOP_RPCC(kernelcost);
|
||||
|
|
|
@ -325,6 +325,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
|
||||
}
|
||||
|
||||
BLASLONG pad_min_l = min_l;
|
||||
|
||||
#if defined(HALF)
|
||||
#if defined(DYNAMIC_ARCH)
|
||||
pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1);
|
||||
#else
|
||||
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Determine step size in m
|
||||
* Note: We are currently on the first step in m
|
||||
*/
|
||||
|
@ -382,13 +392,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
/* Copy part of local region of B into workspace */
|
||||
START_RPCC();
|
||||
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
|
||||
buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride);
|
||||
buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride);
|
||||
STOP_RPCC(copy_B);
|
||||
|
||||
/* Apply kernel with local region of A and part of local region of B */
|
||||
START_RPCC();
|
||||
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
|
||||
sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride,
|
||||
sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride,
|
||||
c, ldc, m_from, jjs);
|
||||
STOP_RPCC(kernel);
|
||||
|
||||
|
|
|
@ -470,9 +470,13 @@ blas_queue_t *tscq;
|
|||
#endif
|
||||
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
#ifdef __aarch64__
|
||||
__asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
|
||||
#else
|
||||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
|
||||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING1;
|
||||
|
@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
queue -> position = pos;
|
||||
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
#ifdef __aarch64__
|
||||
__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode));
|
||||
#else
|
||||
__asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
|
||||
__asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
|
||||
|
||||
|
|
|
@ -69,6 +69,8 @@
|
|||
|
||||
int blas_server_avail = 0;
|
||||
|
||||
extern int openblas_omp_adaptive_env();
|
||||
|
||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||
#ifdef HAVE_C11
|
||||
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
|
@ -282,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){
|
|||
sb = queue -> sb;
|
||||
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
#ifdef __aarch64__
|
||||
__asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
|
||||
#else
|
||||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
|
||||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||
|
@ -381,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
for (i = 0; i < num; i ++) {
|
||||
#ifdef __aarch64__
|
||||
__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode));
|
||||
#else
|
||||
__asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode));
|
||||
__asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -278,12 +278,15 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
|||
} else
|
||||
#endif
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
#ifdef BUILD_DOUBLE
|
||||
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
|
||||
#endif
|
||||
} else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
|
||||
#ifdef BUILD_SINGLE
|
||||
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
#endif
|
||||
} else {
|
||||
/* Other types in future */
|
||||
}
|
||||
|
@ -295,11 +298,15 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
|||
} else
|
||||
#endif
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
#ifdef BUILD_COMPLEX16
|
||||
sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
#endif
|
||||
} else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
|
||||
#ifdef BUILD_COMPLEX
|
||||
sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
#endif
|
||||
} else {
|
||||
/* Other types in future */
|
||||
}
|
||||
|
|
|
@ -1018,7 +1018,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
|||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 24; i++)
|
||||
for ( i=1 ; i <= 25; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
|
|
|
@ -823,6 +823,8 @@ void gotoblas_affinity_init(void) {
|
|||
|
||||
if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS");
|
||||
|
||||
if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS");
|
||||
|
||||
numnodes = 1;
|
||||
|
||||
if (numprocs == 1) {
|
||||
|
|
|
@ -66,9 +66,15 @@ void openblas_read_env() {
|
|||
if(ret<0) ret=0;
|
||||
openblas_env_thread_timeout=(unsigned int)ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_DEFAULT_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_openblas_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
if(ret != 0 || openblas_env_openblas_num_threads == 0)
|
||||
openblas_env_openblas_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
|
|
8
f_check
8
f_check
|
@ -82,10 +82,6 @@ else
|
|||
vendor=FUJITSU
|
||||
openmp='-Kopenmp'
|
||||
;;
|
||||
*Cray*)
|
||||
vendor=CRAY
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*GNU*|*GCC*)
|
||||
|
||||
v="${data#*GCC: *\) }"
|
||||
|
@ -117,6 +113,10 @@ else
|
|||
esac
|
||||
fi
|
||||
;;
|
||||
*Cray*)
|
||||
vendor=CRAY
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*g95*)
|
||||
vendor=G95
|
||||
openmp=''
|
||||
|
|
|
@ -76,11 +76,6 @@ if ($compiler eq "") {
|
|||
$vendor = FUJITSU;
|
||||
$openmp = "-Kopenmp";
|
||||
|
||||
} elsif ($data =~ /Cray/) {
|
||||
|
||||
$vendor = CRAY;
|
||||
$openmp = "-fopenmp";
|
||||
|
||||
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
|
||||
|
||||
$data =~ s/\(+.*?\)+//g;
|
||||
|
@ -106,6 +101,10 @@ if ($compiler eq "") {
|
|||
$openmp = "";
|
||||
}
|
||||
}
|
||||
} elsif ($data =~ /Cray/) {
|
||||
|
||||
$vendor = CRAY;
|
||||
$openmp = "-fopenmp";
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1410,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
|
||||
"-march=armv8.4-a -mtune=neoverse-v1"
|
||||
"-march=armv8.4-a+sve -mtune=neoverse-v1"
|
||||
#define LIBNAME "neoversev1"
|
||||
#define CORENAME "NEOVERSEV1"
|
||||
#endif
|
||||
|
|
|
@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES
|
|||
# these do not have separate 'z' sources
|
||||
set(BLAS3_SOURCES
|
||||
gemm.c symm.c
|
||||
trsm.c syrk.c syr2k.c
|
||||
trsm.c syrk.c syr2k.c gemmt.c
|
||||
)
|
||||
|
||||
set(BLAS3_MANGLED_SOURCES
|
||||
|
@ -189,7 +189,16 @@ if (NOT DEFINED NO_LAPACK)
|
|||
)
|
||||
|
||||
GenerateNamedObjects("${LAPACK_SOURCES}")
|
||||
if (NOT RELAPACK_REPLACE)
|
||||
GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3)
|
||||
else ()
|
||||
GenerateNamedObjects("lapack/getrs.c" "" "" 0 "" "" 0 3)
|
||||
GenerateNamedObjects("lapack/getf2.c" "" "" 0 "" "" 0 3)
|
||||
GenerateNamedObjects("lapack/potf2.c" "" "" 0 "" "" 0 3)
|
||||
GenerateNamedObjects("lapack/laswp.c" "" "" 0 "" "" 0 3)
|
||||
GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3)
|
||||
GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3)
|
||||
endif()
|
||||
endif ()
|
||||
|
||||
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)
|
||||
|
|
|
@ -44,12 +44,12 @@ SBLAS3OBJS = \
|
|||
sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
|
||||
strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
|
||||
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
|
||||
sgeadd.$(SUFFIX)
|
||||
sgeadd.$(SUFFIX) sgemmt.$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SBBLAS1OBJS = sbdot.$(SUFFIX)
|
||||
SBBLAS2OBJS = sbgemv.$(SUFFIX)
|
||||
SBBLAS3OBJS = sbgemm.$(SUFFIX)
|
||||
SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX)
|
||||
SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
|
||||
endif
|
||||
|
||||
|
@ -76,7 +76,7 @@ DBLAS3OBJS = \
|
|||
dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
|
||||
dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
|
||||
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\
|
||||
dgeadd.$(SUFFIX)
|
||||
dgeadd.$(SUFFIX) dgemmt.$(SUFFIX)
|
||||
|
||||
CBLAS1OBJS = \
|
||||
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
|
||||
|
@ -105,7 +105,7 @@ CBLAS3OBJS = \
|
|||
ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
|
||||
chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
|
||||
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\
|
||||
cgeadd.$(SUFFIX)
|
||||
cgeadd.$(SUFFIX) cgemmt.$(SUFFIX)
|
||||
|
||||
ZBLAS1OBJS = \
|
||||
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
|
||||
|
@ -134,7 +134,7 @@ ZBLAS3OBJS = \
|
|||
ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
|
||||
zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
|
||||
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\
|
||||
zgeadd.$(SUFFIX)
|
||||
zgeadd.$(SUFFIX) zgemmt.$(SUFFIX)
|
||||
|
||||
ifeq ($(SUPPORT_GEMM3M), 1)
|
||||
|
||||
|
@ -281,12 +281,12 @@ CSBLAS2OBJS = \
|
|||
CSBLAS3OBJS = \
|
||||
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
|
||||
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
|
||||
cblas_sgeadd.$(SUFFIX)
|
||||
cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
|
||||
CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX)
|
||||
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX)
|
||||
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX)
|
||||
CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
|
||||
endif
|
||||
|
||||
|
@ -306,7 +306,7 @@ CDBLAS2OBJS = \
|
|||
CDBLAS3OBJS += \
|
||||
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
|
||||
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \
|
||||
cblas_dgeadd.$(SUFFIX)
|
||||
cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX)
|
||||
|
||||
CCBLAS1OBJS = \
|
||||
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
|
||||
|
@ -331,7 +331,7 @@ CCBLAS3OBJS = \
|
|||
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
|
||||
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
|
||||
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
|
||||
cblas_cgeadd.$(SUFFIX)
|
||||
cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX)
|
||||
|
||||
CXERBLAOBJ = \
|
||||
cblas_xerbla.$(SUFFIX)
|
||||
|
@ -362,7 +362,7 @@ CZBLAS3OBJS = \
|
|||
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
|
||||
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
|
||||
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
|
||||
cblas_zgeadd.$(SUFFIX)
|
||||
cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX)
|
||||
|
||||
|
||||
ifeq ($(SUPPORT_GEMM3M), 1)
|
||||
|
@ -1300,6 +1300,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
|||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
|
@ -1320,6 +1322,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h
|
|||
xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
|
@ -1907,6 +1927,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h
|
|||
cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
|
|
|
@ -0,0 +1,589 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2022, The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#define SMP_THRESHOLD_MIN 65536.0
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QGEMT "
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DGEMT "
|
||||
#elif defined(BFLOAT16)
|
||||
#define ERROR_NAME "SBGEMT "
|
||||
#else
|
||||
#define ERROR_NAME "SGEMT "
|
||||
#endif
|
||||
#else
|
||||
#define SMP_THRESHOLD_MIN 8192.0
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XGEMT "
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZGEMT "
|
||||
#else
|
||||
#define ERROR_NAME "CGEMT "
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||
#define GEMM_MULTITHREAD_THRESHOLD 4
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||
blasint * M, blasint * N, blasint * K,
|
||||
FLOAT * Alpha,
|
||||
IFLOAT * a, blasint * ldA,
|
||||
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
||||
{
|
||||
|
||||
blasint m, n, k;
|
||||
blasint lda, ldb, ldc;
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
|
||||
char transA, transB, Uplo;
|
||||
IFLOAT *buffer;
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
#if defined(COMPLEX)
|
||||
FLOAT alpha_r, alpha_i, beta_r, beta_i;
|
||||
#else
|
||||
FLOAT alpha, beta;
|
||||
#endif
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
m = *M;
|
||||
n = *N;
|
||||
k = *K;
|
||||
|
||||
#if defined(COMPLEX)
|
||||
FLOAT *alpha = Alpha;
|
||||
alpha_r = *(Alpha + 0);
|
||||
alpha_i = *(Alpha + 1);
|
||||
|
||||
beta_r = *(Beta + 0);
|
||||
beta_i = *(Beta + 1);
|
||||
#else
|
||||
alpha = *Alpha;
|
||||
beta = *Beta;
|
||||
#endif
|
||||
|
||||
lda = *ldA;
|
||||
ldb = *ldB;
|
||||
ldc = *ldC;
|
||||
|
||||
transA = *TRANSA;
|
||||
transB = *TRANSB;
|
||||
Uplo = *UPLO;
|
||||
TOUPPER(transA);
|
||||
TOUPPER(transB);
|
||||
TOUPPER(Uplo);
|
||||
|
||||
transa = -1;
|
||||
transb = -1;
|
||||
uplo = -1;
|
||||
|
||||
if (transA == 'N')
|
||||
transa = 0;
|
||||
if (transA == 'T')
|
||||
transa = 1;
|
||||
#ifndef COMPLEX
|
||||
if (transA == 'R')
|
||||
transa = 0;
|
||||
if (transA == 'C')
|
||||
transa = 1;
|
||||
#else
|
||||
if (transA == 'R')
|
||||
transa = 2;
|
||||
if (transA == 'C')
|
||||
transa = 3;
|
||||
#endif
|
||||
|
||||
if (transB == 'N')
|
||||
transb = 0;
|
||||
if (transB == 'T')
|
||||
transb = 1;
|
||||
#ifndef COMPLEX
|
||||
if (transB == 'R')
|
||||
transb = 0;
|
||||
if (transB == 'C')
|
||||
transb = 1;
|
||||
#else
|
||||
if (transB == 'R')
|
||||
transb = 2;
|
||||
if (transB == 'C')
|
||||
transb = 3;
|
||||
#endif
|
||||
|
||||
if (Uplo == 'U')
|
||||
uplo = 0;
|
||||
if (Uplo == 'L')
|
||||
uplo = 1;
|
||||
|
||||
info = 0;
|
||||
|
||||
if (uplo < 0)
|
||||
info = 14;
|
||||
if (ldc < m)
|
||||
info = 13;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (n < 0)
|
||||
info = 4;
|
||||
if (m < 0)
|
||||
info = 3;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
if (transa < 0)
|
||||
info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
#else
|
||||
|
||||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
|
||||
blasint N, blasint k,
|
||||
#ifndef COMPLEX
|
||||
FLOAT alpha,
|
||||
IFLOAT * A, blasint LDA,
|
||||
IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
|
||||
{
|
||||
#else
|
||||
void *valpha,
|
||||
void *va, blasint LDA,
|
||||
void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc)
|
||||
{
|
||||
FLOAT *alpha = (FLOAT *) valpha;
|
||||
FLOAT *beta = (FLOAT *) vbeta;
|
||||
FLOAT *A = (FLOAT *) va;
|
||||
FLOAT *B = (FLOAT *) vb;
|
||||
FLOAT *c = (FLOAT *) vc;
|
||||
#endif
|
||||
FLOAT *aa, *bb, *cc;
|
||||
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
blasint m, n, lda, ldb;
|
||||
FLOAT *a, *b;
|
||||
XFLOAT *buffer;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
transa = -1;
|
||||
transb = -1;
|
||||
info = 0;
|
||||
|
||||
if (order == CblasColMajor) {
|
||||
|
||||
if (TransA == CblasNoTrans)
|
||||
transa = 0;
|
||||
if (TransA == CblasTrans)
|
||||
transa = 1;
|
||||
#ifndef COMPLEX
|
||||
if (TransA == CblasConjNoTrans)
|
||||
transa = 0;
|
||||
if (TransA == CblasConjTrans)
|
||||
transa = 1;
|
||||
#else
|
||||
if (TransA == CblasConjNoTrans)
|
||||
transa = 2;
|
||||
if (TransA == CblasConjTrans)
|
||||
transa = 3;
|
||||
#endif
|
||||
if (TransB == CblasNoTrans)
|
||||
transb = 0;
|
||||
if (TransB == CblasTrans)
|
||||
transb = 1;
|
||||
#ifndef COMPLEX
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transb = 0;
|
||||
if (TransB == CblasConjTrans)
|
||||
transb = 1;
|
||||
#else
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transb = 2;
|
||||
if (TransB == CblasConjTrans)
|
||||
transb = 3;
|
||||
#endif
|
||||
|
||||
m = M;
|
||||
n = N;
|
||||
|
||||
a = (void *)A;
|
||||
b = (void *)B;
|
||||
lda = LDA;
|
||||
ldb = LDB;
|
||||
|
||||
info = -1;
|
||||
|
||||
if (ldc < m)
|
||||
info = 13;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (n < 0)
|
||||
info = 4;
|
||||
if (m < 0)
|
||||
info = 3;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
if (transa < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
||||
if (order == CblasRowMajor) {
|
||||
m = N;
|
||||
n = M;
|
||||
|
||||
a = (void *)B;
|
||||
b = (void *)A;
|
||||
|
||||
lda = LDB;
|
||||
ldb = LDA;
|
||||
|
||||
if (TransB == CblasNoTrans)
|
||||
transa = 0;
|
||||
if (TransB == CblasTrans)
|
||||
transa = 1;
|
||||
#ifndef COMPLEX
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transa = 0;
|
||||
if (TransB == CblasConjTrans)
|
||||
transa = 1;
|
||||
#else
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transa = 2;
|
||||
if (TransB == CblasConjTrans)
|
||||
transa = 3;
|
||||
#endif
|
||||
if (TransA == CblasNoTrans)
|
||||
transb = 0;
|
||||
if (TransA == CblasTrans)
|
||||
transb = 1;
|
||||
#ifndef COMPLEX
|
||||
if (TransA == CblasConjNoTrans)
|
||||
transb = 0;
|
||||
if (TransA == CblasConjTrans)
|
||||
transb = 1;
|
||||
#else
|
||||
if (TransA == CblasConjNoTrans)
|
||||
transb = 2;
|
||||
if (TransA == CblasConjTrans)
|
||||
transb = 3;
|
||||
#endif
|
||||
|
||||
info = -1;
|
||||
|
||||
if (ldc < m)
|
||||
info = 13;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (n < 0)
|
||||
info = 4;
|
||||
if (m < 0)
|
||||
info = 3;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
if (transa < 0)
|
||||
info = 1;
|
||||
|
||||
}
|
||||
|
||||
uplo = -1;
|
||||
if (Uplo == CblasUpper)
|
||||
uplo = 0;
|
||||
if (Uplo == CblasLower)
|
||||
uplo = 1;
|
||||
if (uplo < 0)
|
||||
info = 14;
|
||||
|
||||
if (info >= 0) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
#if defined(COMPLEX)
|
||||
FLOAT alpha_r = *(alpha + 0);
|
||||
FLOAT alpha_i = *(alpha + 1);
|
||||
|
||||
FLOAT beta_r = *(beta + 0);
|
||||
FLOAT beta_i = *(beta + 1);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
int buffer_size;
|
||||
blasint l;
|
||||
blasint i, j;
|
||||
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
|
||||
#ifdef SMP
|
||||
static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *,
|
||||
BLASLONG, FLOAT *, BLASLONG, FLOAT *,
|
||||
BLASLONG, FLOAT *, int) = {
|
||||
#ifdef XDOUBLE
|
||||
xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c,
|
||||
xgemv_thread_o, xgemv_thread_u, xgemv_thread_s,
|
||||
xgemv_thread_d,
|
||||
#elif defined DOUBLE
|
||||
zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c,
|
||||
zgemv_thread_o, zgemv_thread_u, zgemv_thread_s,
|
||||
zgemv_thread_d,
|
||||
#else
|
||||
cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c,
|
||||
cgemv_thread_o, cgemv_thread_u, cgemv_thread_s,
|
||||
cgemv_thread_d,
|
||||
#endif
|
||||
};
|
||||
#endif
|
||||
|
||||
int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *,
|
||||
BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG,
|
||||
FLOAT *) = {
|
||||
GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,};
|
||||
|
||||
#else
|
||||
|
||||
#ifdef SMP
|
||||
static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *,
|
||||
BLASLONG, FLOAT *, BLASLONG, FLOAT *,
|
||||
BLASLONG, FLOAT *, int) = {
|
||||
#ifdef XDOUBLE
|
||||
qgemv_thread_n, qgemv_thread_t,
|
||||
#elif defined DOUBLE
|
||||
dgemv_thread_n, dgemv_thread_t,
|
||||
#else
|
||||
sgemv_thread_n, sgemv_thread_t,
|
||||
#endif
|
||||
};
|
||||
#endif
|
||||
int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG,
|
||||
FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
|
||||
GEMV_N, GEMV_T,};
|
||||
|
||||
#endif
|
||||
|
||||
if ((m == 0) || (n == 0))
|
||||
return;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
const blasint incb = (transb == 0) ? 1 : ldb;
|
||||
|
||||
if (uplo == 1) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = n - i;
|
||||
|
||||
l = j;
|
||||
#if defined(COMPLEX)
|
||||
aa = a + i * 2;
|
||||
bb = b + i * ldb * 2;
|
||||
if (transa) {
|
||||
l = k;
|
||||
aa = a + lda * i * 2;
|
||||
bb = b + i * 2;
|
||||
}
|
||||
cc = c + i * 2 * ldc + i * 2;
|
||||
#else
|
||||
aa = a + i;
|
||||
bb = b + i * ldb;
|
||||
if (transa) {
|
||||
l = k;
|
||||
aa = a + lda * i;
|
||||
bb = b + i;
|
||||
}
|
||||
cc = c + i * ldc + i;
|
||||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (beta_r != ONE || beta_i != ZERO)
|
||||
SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
|
||||
NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||
return;
|
||||
#else
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO)
|
||||
continue;
|
||||
#endif
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
aa, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
||||
bb, incb, cc, 1, buffer);
|
||||
#endif
|
||||
#ifdef SMP
|
||||
} else {
|
||||
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||
lda, bb, incb, cc,
|
||||
1, buffer,
|
||||
nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
}
|
||||
} else {
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
j = i + 1;
|
||||
|
||||
l = j;
|
||||
#if defined COMPLEX
|
||||
bb = b + i * ldb * 2;
|
||||
if (transa) {
|
||||
l = k;
|
||||
bb = b + i * 2;
|
||||
}
|
||||
cc = c + i * 2 * ldc;
|
||||
#else
|
||||
bb = b + i * ldb;
|
||||
if (transa) {
|
||||
l = k;
|
||||
bb = b + i;
|
||||
}
|
||||
cc = c + i * ldc;
|
||||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (beta_r != ONE || beta_i != ZERO)
|
||||
SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
|
||||
NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||
return;
|
||||
#else
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO)
|
||||
continue;
|
||||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
a, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
||||
incb, cc, 1, buffer);
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||
bb, incb, cc, 1,
|
||||
buffer, nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
}
|
||||
}
|
||||
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
|
||||
args.m * args.k + args.k * args.n +
|
||||
args.m * args.n, 2 * args.m * args.n * args.k);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return;
|
||||
}
|
|
@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG)
|
|||
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
|
||||
GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2)
|
||||
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
|
||||
AVX2OPT = -mavx2
|
||||
AVX2OPT = -mavx2 -mfma
|
||||
endif
|
||||
endif
|
||||
ifdef NO_AVX2
|
||||
|
@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX)
|
|||
endif
|
||||
else ifeq ($(TARGET_CORE), HASWELL)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||
else ifeq ($(TARGET_CORE), ZEN)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||
else ifeq ($(TARGET_CORE), LOONGSON3R4)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
|
||||
else
|
||||
|
|
|
@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
|
|||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
|
|
@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
|
|||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
@ -190,10 +190,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|||
|
||||
SBGEMM_BETA = sbgemm_beta_neoversen2.c
|
||||
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c
|
||||
SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c
|
||||
SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c
|
||||
SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c
|
||||
SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c
|
||||
SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c
|
||||
SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c
|
||||
SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c
|
||||
SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c
|
||||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
|
@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
|
|||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
|
|
@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
|
|||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
|
|
@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c
|
|||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
|
||||
DDOTKERNEL = dot_thunderx2t99.c
|
||||
SDOTKERNEL = dot_thunderx2t99.c
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
Copyright (c) 2022, Arm Ltd
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Some compilers will report feature support for SVE without the appropriate
|
||||
// header available
|
||||
#ifdef HAVE_SVE
|
||||
#if defined __has_include
|
||||
#if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE
|
||||
#define USE_SVE
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef USE_SVE
|
||||
#include "dot_kernel_sve.c"
|
||||
#endif
|
||||
#include "dot_kernel_asimd.c"
|
||||
|
||||
#if defined(SMP)
|
||||
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||
#endif
|
||||
|
||||
static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
RETURN_TYPE dot = 0.0 ;
|
||||
|
||||
if ( n <= 0 ) return dot;
|
||||
|
||||
#ifdef USE_SVE
|
||||
if (inc_x == 1 && inc_y == 1) {
|
||||
return dot_kernel_sve(n, x, y);
|
||||
}
|
||||
#endif
|
||||
|
||||
return dot_kernel_asimd(n, x, inc_x, y, inc_y);
|
||||
}
|
||||
|
||||
#if defined(SMP)
|
||||
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
|
||||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
|
||||
{
|
||||
*(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
#if defined(SMP)
|
||||
int nthreads;
|
||||
FLOAT dummy_alpha;
|
||||
#endif
|
||||
RETURN_TYPE dot = 0.0;
|
||||
|
||||
#if defined(SMP)
|
||||
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||
} else {
|
||||
int mode, i;
|
||||
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
|
||||
RETURN_TYPE *ptr;
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#else
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#endif
|
||||
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||
x, inc_x, y, inc_y, result, 0,
|
||||
( void *)dot_thread_function, nthreads);
|
||||
|
||||
ptr = (RETURN_TYPE *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
dot = dot + (*ptr);
|
||||
ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
|
||||
}
|
||||
}
|
||||
#else
|
||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||
#endif
|
||||
|
||||
return dot;
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
Copyright (c) 2022, Arm Ltd
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
@ -36,25 +37,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define RETURN_TYPE double
|
||||
#endif
|
||||
|
||||
#define N "x0" /* vector length */
|
||||
#define X "x1" /* "X" vector address */
|
||||
#define INC_X "x2" /* "X" stride */
|
||||
#define Y "x3" /* "Y" vector address */
|
||||
#define INC_Y "x4" /* "Y" stride */
|
||||
#define J "x5" /* loop variable */
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#if !defined(DSDOT)
|
||||
#define DOT_MOD "s"
|
||||
#define REG0 "wzr"
|
||||
#define DOTF "s0"
|
||||
#define TMPX "s16"
|
||||
#define TMPY "s24"
|
||||
#define INC_SHIFT "2"
|
||||
#define N_DIV_SHIFT "6"
|
||||
#define N_REM_MASK "63"
|
||||
#else
|
||||
#define DOT_MOD "d"
|
||||
#define REG0 "xzr"
|
||||
#define DOTF "d0"
|
||||
#define TMPX "s16"
|
||||
#define TMPX1 "d2"
|
||||
#define TMPY "s24"
|
||||
|
@ -64,8 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define N_REM_MASK "15"
|
||||
#endif
|
||||
#else
|
||||
#define DOT_MOD "d"
|
||||
#define REG0 "xzr"
|
||||
#define DOTF "d0"
|
||||
#define TMPX "d16"
|
||||
#define TMPY "d24"
|
||||
#define INC_SHIFT "3"
|
||||
|
@ -73,59 +67,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define N_REM_MASK "31"
|
||||
#endif
|
||||
|
||||
#define OUT "%"DOT_MOD"[DOT_]"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
|
||||
#if !defined(DSDOT)
|
||||
#define KERNEL_F1 \
|
||||
" ldr "TMPX", ["X"] \n" \
|
||||
" ldr "TMPY", ["Y"] \n" \
|
||||
" add "X", "X", "INC_X" \n" \
|
||||
" add "Y", "Y", "INC_Y" \n" \
|
||||
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
|
||||
" ldr "TMPX", [%[X_]] \n" \
|
||||
" ldr "TMPY", [%[Y_]] \n" \
|
||||
" add %[X_], %[X_], %[INCX_] \n" \
|
||||
" add %[Y_], %[Y_], %[INCY_] \n" \
|
||||
" fmadd "OUT", "TMPX", "TMPY", "OUT" \n"
|
||||
|
||||
#define KERNEL_F \
|
||||
" ldp q16, q17, ["X"] \n" \
|
||||
" ldp q24, q25, ["Y"] \n" \
|
||||
" ldp q18, q19, ["X", #32] \n" \
|
||||
" ldp q26, q27, ["Y", #32] \n" \
|
||||
" ldp q16, q17, [%[X_]] \n" \
|
||||
" ldp q24, q25, [%[Y_]] \n" \
|
||||
" ldp q18, q19, [%[X_], #32] \n" \
|
||||
" ldp q26, q27, [%[Y_], #32] \n" \
|
||||
" fmla v0.4s, v16.4s, v24.4s \n" \
|
||||
" fmla v1.4s, v17.4s, v25.4s \n" \
|
||||
" ldp q20, q21, ["X", #64] \n" \
|
||||
" ldp q28, q29, ["Y", #64] \n" \
|
||||
" ldp q20, q21, [%[X_], #64] \n" \
|
||||
" ldp q28, q29, [%[Y_], #64] \n" \
|
||||
" fmla v2.4s, v18.4s, v26.4s \n" \
|
||||
" fmla v3.4s, v19.4s, v27.4s \n" \
|
||||
" ldp q22, q23, ["X", #96] \n" \
|
||||
" ldp q30, q31, ["Y", #96] \n" \
|
||||
" add "Y", "Y", #128 \n" \
|
||||
" add "X", "X", #128 \n" \
|
||||
" ldp q22, q23, [%[X_], #96] \n" \
|
||||
" ldp q30, q31, [%[Y_], #96] \n" \
|
||||
" add %[Y_], %[Y_], #128 \n" \
|
||||
" add %[X_], %[X_], #128 \n" \
|
||||
" fmla v4.4s, v20.4s, v28.4s \n" \
|
||||
" fmla v5.4s, v21.4s, v29.4s \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \
|
||||
" fmla v6.4s, v22.4s, v30.4s \n" \
|
||||
" fmla v7.4s, v23.4s, v31.4s \n" \
|
||||
" ldp q16, q17, ["X"] \n" \
|
||||
" ldp q24, q25, ["Y"] \n" \
|
||||
" ldp q18, q19, ["X", #32] \n" \
|
||||
" ldp q26, q27, ["Y", #32] \n" \
|
||||
" ldp q16, q17, [%[X_]] \n" \
|
||||
" ldp q24, q25, [%[Y_]] \n" \
|
||||
" ldp q18, q19, [%[X_], #32] \n" \
|
||||
" ldp q26, q27, [%[Y_], #32] \n" \
|
||||
" fmla v0.4s, v16.4s, v24.4s \n" \
|
||||
" fmla v1.4s, v17.4s, v25.4s \n" \
|
||||
" ldp q20, q21, ["X", #64] \n" \
|
||||
" ldp q28, q29, ["Y", #64] \n" \
|
||||
" ldp q20, q21, [%[X_], #64] \n" \
|
||||
" ldp q28, q29, [%[Y_], #64] \n" \
|
||||
" fmla v2.4s, v18.4s, v26.4s \n" \
|
||||
" fmla v3.4s, v19.4s, v27.4s \n" \
|
||||
" ldp q22, q23, ["X", #96] \n" \
|
||||
" ldp q30, q31, ["Y", #96] \n" \
|
||||
" add "Y", "Y", #128 \n" \
|
||||
" add "X", "X", #128 \n" \
|
||||
" ldp q22, q23, [%[X_], #96] \n" \
|
||||
" ldp q30, q31, [%[Y_], #96] \n" \
|
||||
" add %[Y_], %[Y_], #128 \n" \
|
||||
" add %[X_], %[X_], #128 \n" \
|
||||
" fmla v4.4s, v20.4s, v28.4s \n" \
|
||||
" fmla v5.4s, v21.4s, v29.4s \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \
|
||||
" fmla v6.4s, v22.4s, v30.4s \n" \
|
||||
" fmla v7.4s, v23.4s, v31.4s \n"
|
||||
|
||||
|
@ -142,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#else /* !defined(DSDOT) */
|
||||
#define KERNEL_F1 \
|
||||
" ldr "TMPX", ["X"] \n" \
|
||||
" ldr "TMPY", ["Y"] \n" \
|
||||
" add "X", "X", "INC_X" \n" \
|
||||
" add "Y", "Y", "INC_Y" \n" \
|
||||
" ldr "TMPX", [%[X_]] \n" \
|
||||
" ldr "TMPY", [%[Y_]] \n" \
|
||||
" add %[X_], %[X_], %[INCX_] \n" \
|
||||
" add %[Y_], %[Y_], %[INCY_] \n" \
|
||||
" fcvt "TMPX1", "TMPX" \n" \
|
||||
" fcvt "TMPY1", "TMPY" \n" \
|
||||
" fmul "TMPX1", "TMPX1", "TMPY1" \n" \
|
||||
" fadd "DOTF", "DOTF", "TMPX1" \n"
|
||||
" fadd "OUT", "OUT", "TMPX1" \n"
|
||||
|
||||
|
||||
#define KERNEL_F \
|
||||
" ldp q18, q19, ["X"] \n" \
|
||||
" ldp q26, q27, ["Y"] \n" \
|
||||
" ldp q18, q19, [%[X_]] \n" \
|
||||
" ldp q26, q27, [%[Y_]] \n" \
|
||||
" fcvtl v16.2d, v18.2s \n" \
|
||||
" fcvtl2 v17.2d, v18.4s \n" \
|
||||
" fcvtl v18.2d, v19.2s \n" \
|
||||
|
@ -163,8 +159,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
" fcvtl2 v25.2d, v26.4s \n" \
|
||||
" fcvtl v26.2d, v27.2s \n" \
|
||||
" fcvtl2 v27.2d, v27.4s \n" \
|
||||
" ldp q22, q23, ["X", #32] \n" \
|
||||
" ldp q30, q31, ["Y", #32] \n" \
|
||||
" ldp q22, q23, [%[X_], #32] \n" \
|
||||
" ldp q30, q31, [%[Y_], #32] \n" \
|
||||
" fcvtl v20.2d, v22.2s \n" \
|
||||
" fcvtl2 v21.2d, v22.4s \n" \
|
||||
" fcvtl v22.2d, v23.2s \n" \
|
||||
|
@ -173,16 +169,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
" fcvtl2 v29.2d, v30.4s \n" \
|
||||
" fcvtl v30.2d, v31.2s \n" \
|
||||
" fcvtl2 v31.2d, v31.4s \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \
|
||||
" fmla v0.2d, v16.2d, v24.2d \n" \
|
||||
" fmla v1.2d, v17.2d, v25.2d \n" \
|
||||
" fmla v2.2d, v18.2d, v26.2d \n" \
|
||||
" fmla v3.2d, v19.2d, v27.2d \n" \
|
||||
" add "Y", "Y", #64 \n" \
|
||||
" add "X", "X", #64 \n" \
|
||||
" add %[Y_], %[Y_], #64 \n" \
|
||||
" add %[X_], %[X_], #64 \n" \
|
||||
" fmla v4.2d, v20.2d, v28.2d \n" \
|
||||
" fmla v5.2d, v21.2d, v29.2d \n" \
|
||||
" fmla v6.2d, v22.2d, v30.2d \n" \
|
||||
|
@ -196,60 +192,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
" fadd v0.2d, v0.2d, v2.2d \n" \
|
||||
" fadd v4.2d, v4.2d, v6.2d \n" \
|
||||
" fadd v0.2d, v0.2d, v4.2d \n" \
|
||||
" faddp "DOTF", v0.2d \n"
|
||||
" faddp "OUT", v0.2d \n"
|
||||
#endif /* !defined(DSDOT) */
|
||||
|
||||
#else /* !defined(DOUBLE) */
|
||||
#define KERNEL_F1 \
|
||||
" ldr "TMPX", ["X"] \n" \
|
||||
" ldr "TMPY", ["Y"] \n" \
|
||||
" add "X", "X", "INC_X" \n" \
|
||||
" add "Y", "Y", "INC_Y" \n" \
|
||||
" fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n"
|
||||
" ldr "TMPX", [%[X_]] \n" \
|
||||
" ldr "TMPY", [%[Y_]] \n" \
|
||||
" add %[X_], %[X_], %[INCX_] \n" \
|
||||
" add %[Y_], %[Y_], %[INCY_] \n" \
|
||||
" fmadd "OUT", "TMPX", "TMPY", "OUT" \n"
|
||||
|
||||
#define KERNEL_F \
|
||||
" ldp q16, q17, ["X"] \n" \
|
||||
" ldp q24, q25, ["Y"] \n" \
|
||||
" ldp q18, q19, ["X", #32] \n" \
|
||||
" ldp q26, q27, ["Y", #32] \n" \
|
||||
" ldp q16, q17, [%[X_]] \n" \
|
||||
" ldp q24, q25, [%[Y_]] \n" \
|
||||
" ldp q18, q19, [%[X_], #32] \n" \
|
||||
" ldp q26, q27, [%[Y_], #32] \n" \
|
||||
" fmla v0.2d, v16.2d, v24.2d \n" \
|
||||
" fmla v1.2d, v17.2d, v25.2d \n" \
|
||||
" ldp q20, q21, ["X", #64] \n" \
|
||||
" ldp q28, q29, ["Y", #64] \n" \
|
||||
" ldp q20, q21, [%[X_], #64] \n" \
|
||||
" ldp q28, q29, [%[Y_], #64] \n" \
|
||||
" fmla v2.2d, v18.2d, v26.2d \n" \
|
||||
" fmla v3.2d, v19.2d, v27.2d \n" \
|
||||
" ldp q22, q23, ["X", #96] \n" \
|
||||
" ldp q30, q31, ["Y", #96] \n" \
|
||||
" add "Y", "Y", #128 \n" \
|
||||
" add "X", "X", #128 \n" \
|
||||
" ldp q22, q23, [%[X_], #96] \n" \
|
||||
" ldp q30, q31, [%[Y_], #96] \n" \
|
||||
" add %[Y_], %[Y_], #128 \n" \
|
||||
" add %[X_], %[X_], #128 \n" \
|
||||
" fmla v4.2d, v20.2d, v28.2d \n" \
|
||||
" fmla v5.2d, v21.2d, v29.2d \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \
|
||||
" fmla v6.2d, v22.2d, v30.2d \n" \
|
||||
" fmla v7.2d, v23.2d, v31.2d \n" \
|
||||
" ldp q16, q17, ["X"] \n" \
|
||||
" ldp q24, q25, ["Y"] \n" \
|
||||
" ldp q18, q19, ["X", #32] \n" \
|
||||
" ldp q26, q27, ["Y", #32] \n" \
|
||||
" ldp q16, q17, [%[X_]] \n" \
|
||||
" ldp q24, q25, [%[Y_]] \n" \
|
||||
" ldp q18, q19, [%[X_], #32] \n" \
|
||||
" ldp q26, q27, [%[Y_], #32] \n" \
|
||||
" fmla v0.2d, v16.2d, v24.2d \n" \
|
||||
" fmla v1.2d, v17.2d, v25.2d \n" \
|
||||
" ldp q20, q21, ["X", #64] \n" \
|
||||
" ldp q28, q29, ["Y", #64] \n" \
|
||||
" ldp q20, q21, [%[X_], #64] \n" \
|
||||
" ldp q28, q29, [%[Y_], #64] \n" \
|
||||
" fmla v2.2d, v18.2d, v26.2d \n" \
|
||||
" fmla v3.2d, v19.2d, v27.2d \n" \
|
||||
" ldp q22, q23, ["X", #96] \n" \
|
||||
" ldp q30, q31, ["Y", #96] \n" \
|
||||
" add "Y", "Y", #128 \n" \
|
||||
" add "X", "X", #128 \n" \
|
||||
" ldp q22, q23, [%[X_], #96] \n" \
|
||||
" ldp q30, q31, [%[Y_], #96] \n" \
|
||||
" add %[Y_], %[Y_], #128 \n" \
|
||||
" add %[X_], %[X_], #128 \n" \
|
||||
" fmla v4.2d, v20.2d, v28.2d \n" \
|
||||
" fmla v5.2d, v21.2d, v29.2d \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896] \n" \
|
||||
" PRFM PLDL1KEEP, ["X", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, ["Y", #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896] \n" \
|
||||
" PRFM PLDL1KEEP, [%[X_], #896+64] \n" \
|
||||
" PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \
|
||||
" fmla v6.2d, v22.2d, v30.2d \n" \
|
||||
" fmla v7.2d, v23.2d, v31.2d \n"
|
||||
|
||||
|
@ -261,28 +257,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
" fadd v0.2d, v0.2d, v2.2d \n" \
|
||||
" fadd v4.2d, v4.2d, v6.2d \n" \
|
||||
" fadd v0.2d, v0.2d, v4.2d \n" \
|
||||
" faddp "DOTF", v0.2d \n"
|
||||
" faddp "OUT", v0.2d \n"
|
||||
#endif /* !defined(DOUBLE) */
|
||||
|
||||
#if defined(SMP)
|
||||
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||
#endif
|
||||
|
||||
static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
RETURN_TYPE dot = 0.0;
|
||||
|
||||
if ( n < 0 ) return dot;
|
||||
BLASLONG j = 0;
|
||||
|
||||
__asm__ __volatile__ (
|
||||
" mov "N", %[N_] \n"
|
||||
" mov "X", %[X_] \n"
|
||||
" mov "INC_X", %[INCX_] \n"
|
||||
" mov "Y", %[Y_] \n"
|
||||
" mov "INC_Y", %[INCY_] \n"
|
||||
" fmov "DOTF", "REG0" \n"
|
||||
" fmov "OUT", "REG0" \n"
|
||||
" fmov d1, xzr \n"
|
||||
" fmov d2, xzr \n"
|
||||
" fmov d3, xzr \n"
|
||||
|
@ -290,42 +274,40 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B
|
|||
" fmov d5, xzr \n"
|
||||
" fmov d6, xzr \n"
|
||||
" fmov d7, xzr \n"
|
||||
" cmp "N", xzr \n"
|
||||
" ble 9f //dot_kernel_L999 \n"
|
||||
" cmp "INC_X", #1 \n"
|
||||
" cmp %[INCX_], #1 \n"
|
||||
" bne 5f //dot_kernel_S_BEGIN \n"
|
||||
" cmp "INC_Y", #1 \n"
|
||||
" cmp %[INCY_], #1 \n"
|
||||
" bne 5f //dot_kernel_S_BEGIN \n"
|
||||
|
||||
"1: //dot_kernel_F_BEGIN: \n"
|
||||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
||||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
||||
" asr "J", "N", #"N_DIV_SHIFT" \n"
|
||||
" cmp "J", xzr \n"
|
||||
" lsl %[INCX_], %[INCX_], "INC_SHIFT" \n"
|
||||
" lsl %[INCY_], %[INCY_], "INC_SHIFT" \n"
|
||||
" asr %[J_], %[N_], #"N_DIV_SHIFT" \n"
|
||||
" cmp %[J_], xzr \n"
|
||||
" beq 3f //dot_kernel_F1 \n"
|
||||
|
||||
" .align 5 \n"
|
||||
"2: //dot_kernel_F: \n"
|
||||
" "KERNEL_F" \n"
|
||||
" subs "J", "J", #1 \n"
|
||||
" subs %[J_], %[J_], #1 \n"
|
||||
" bne 2b //dot_kernel_F \n"
|
||||
" "KERNEL_F_FINALIZE" \n"
|
||||
|
||||
"3: //dot_kernel_F1: \n"
|
||||
" ands "J", "N", #"N_REM_MASK" \n"
|
||||
" ands %[J_], %[N_], #"N_REM_MASK" \n"
|
||||
" ble 9f //dot_kernel_L999 \n"
|
||||
|
||||
"4: //dot_kernel_F10: \n"
|
||||
" "KERNEL_F1" \n"
|
||||
" subs "J", "J", #1 \n"
|
||||
" subs %[J_], %[J_], #1 \n"
|
||||
" bne 4b //dot_kernel_F10 \n"
|
||||
" b 9f //dot_kernel_L999 \n"
|
||||
|
||||
"5: //dot_kernel_S_BEGIN: \n"
|
||||
" lsl "INC_X", "INC_X", "INC_SHIFT" \n"
|
||||
" lsl "INC_Y", "INC_Y", "INC_SHIFT" \n"
|
||||
" asr "J", "N", #2 \n"
|
||||
" cmp "J", xzr \n"
|
||||
" lsl %[INCX_], %[INCX_], "INC_SHIFT" \n"
|
||||
" lsl %[INCY_], %[INCY_], "INC_SHIFT" \n"
|
||||
" asr %[J_], %[N_], #2 \n"
|
||||
" cmp %[J_], xzr \n"
|
||||
" ble 7f //dot_kernel_S1 \n"
|
||||
|
||||
"6: //dot_kernel_S4: \n"
|
||||
|
@ -333,88 +315,31 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B
|
|||
" "KERNEL_F1" \n"
|
||||
" "KERNEL_F1" \n"
|
||||
" "KERNEL_F1" \n"
|
||||
" subs "J", "J", #1 \n"
|
||||
" subs %[J_], %[J_], #1 \n"
|
||||
" bne 6b //dot_kernel_S4 \n"
|
||||
|
||||
"7: //dot_kernel_S1: \n"
|
||||
" ands "J", "N", #3 \n"
|
||||
" ands %[J_], %[N_], #3 \n"
|
||||
" ble 9f //dot_kernel_L999 \n"
|
||||
|
||||
"8: //dot_kernel_S10: \n"
|
||||
" "KERNEL_F1" \n"
|
||||
" subs "J", "J", #1 \n"
|
||||
" subs %[J_], %[J_], #1 \n"
|
||||
" bne 8b //dot_kernel_S10 \n"
|
||||
|
||||
"9: //dot_kernel_L999: \n"
|
||||
" str "DOTF", [%[DOT_]] \n"
|
||||
|
||||
:
|
||||
: [DOT_] "r" (&dot), //%0
|
||||
[N_] "r" (n), //%1
|
||||
[X_] "r" (x), //%2
|
||||
[INCX_] "r" (inc_x), //%3
|
||||
[Y_] "r" (y), //%4
|
||||
[INCY_] "r" (inc_y) //%5
|
||||
: [DOT_] "=&w" (dot)
|
||||
: [N_] "r" (n),
|
||||
[X_] "r" (x),
|
||||
[INCX_] "r" (inc_x),
|
||||
[Y_] "r" (y),
|
||||
[INCY_] "r" (inc_y),
|
||||
[J_] "r" (j)
|
||||
: "cc",
|
||||
"memory",
|
||||
"x0", "x1", "x2", "x3", "x4", "x5",
|
||||
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
|
||||
"d1", "d2", "d3", "d4", "d5", "d6", "d7"
|
||||
);
|
||||
|
||||
return dot;
|
||||
}
|
||||
|
||||
#if defined(SMP)
|
||||
static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
|
||||
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
|
||||
{
|
||||
*(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
#if defined(SMP)
|
||||
int nthreads;
|
||||
FLOAT dummy_alpha;
|
||||
#endif
|
||||
RETURN_TYPE dot = 0.0;
|
||||
|
||||
#if defined(SMP)
|
||||
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(1);
|
||||
|
||||
if (nthreads == 1) {
|
||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||
} else {
|
||||
int mode, i;
|
||||
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
|
||||
RETURN_TYPE *ptr;
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#else
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#endif
|
||||
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||
x, inc_x, y, inc_y, result, 0,
|
||||
( void *)dot_thread_function, nthreads);
|
||||
|
||||
ptr = (RETURN_TYPE *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
dot = dot + (*ptr);
|
||||
ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2);
|
||||
}
|
||||
}
|
||||
#else
|
||||
dot = dot_compute(n, x, inc_x, y, inc_y);
|
||||
#endif
|
||||
|
||||
return dot;
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, Arm Ltd
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
|
||||
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define SVE_TYPE svfloat64_t
|
||||
#define SVE_ZERO svdup_f64(0.0)
|
||||
#define SVE_WHILELT svwhilelt_b64
|
||||
#define SVE_ALL svptrue_b64()
|
||||
#define SVE_WIDTH svcntd()
|
||||
#else
|
||||
#define SVE_TYPE svfloat32_t
|
||||
#define SVE_ZERO svdup_f32(0.0)
|
||||
#define SVE_WHILELT svwhilelt_b32
|
||||
#define SVE_ALL svptrue_b32()
|
||||
#define SVE_WIDTH svcntw()
|
||||
#endif
|
||||
|
||||
static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||
SVE_TYPE acc_a = SVE_ZERO;
|
||||
SVE_TYPE acc_b = SVE_ZERO;
|
||||
|
||||
BLASLONG sve_width = SVE_WIDTH;
|
||||
|
||||
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
|
||||
svbool_t pg_a = SVE_WHILELT(i, n);
|
||||
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
|
||||
|
||||
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
|
||||
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
|
||||
SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
|
||||
SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
|
||||
|
||||
acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a);
|
||||
acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b);
|
||||
}
|
||||
|
||||
return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b);
|
||||
}
|
|
@ -30,100 +30,37 @@
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef ALPHA_ONE
|
||||
#define LOAD_C(M, N) \
|
||||
mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc);
|
||||
|
||||
#define LOAD_C_LOW(M, N) \
|
||||
mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc);
|
||||
|
||||
#define LOAD_C_EVEN(M, N) \
|
||||
mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc);
|
||||
|
||||
#define LOAD_C_FIRST(M, N) \
|
||||
mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc);
|
||||
|
||||
#define STORE_C(M, N) \
|
||||
svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#define STORE_C_LOW(M, N) \
|
||||
svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#define STORE_C_EVEN(M, N) \
|
||||
svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#define STORE_C_FIRST(M, N) \
|
||||
svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#else
|
||||
#define LOAD_C(M, N) \
|
||||
mc##M##N = svdup_f32(0); \
|
||||
oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc);
|
||||
|
||||
#define LOAD_C_LOW(M, N) \
|
||||
mc##M##N = svdup_f32(0); \
|
||||
oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc);
|
||||
|
||||
#define LOAD_C_EVEN(M, N) \
|
||||
mc##M##N = svdup_f32(0); \
|
||||
oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc);
|
||||
|
||||
#define LOAD_C_FIRST(M, N) \
|
||||
mc##M##N = svdup_f32(0); \
|
||||
oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc);
|
||||
|
||||
#define STORE_C(M, N) \
|
||||
mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \
|
||||
svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#define STORE_C_LOW(M, N) \
|
||||
mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \
|
||||
svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#define STORE_C_EVEN(M, N) \
|
||||
mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \
|
||||
svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#define STORE_C_FIRST(M, N) \
|
||||
mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \
|
||||
svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);
|
||||
|
||||
#endif
|
||||
|
||||
#define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M);
|
||||
|
||||
#define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N);
|
||||
#define INIT_C(M, N) mc##M##N = svdup_f32(0);
|
||||
|
||||
#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N);
|
||||
|
||||
#define LOAD_KREST_1(NAME, M) \
|
||||
m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \
|
||||
*(ptr_##NAME##M + 1), zero, zero, zero);
|
||||
|
||||
#define LOAD_KREST_1_LOW(NAME, M) \
|
||||
m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \
|
||||
zero, zero);
|
||||
|
||||
#define LOAD_KREST_2(NAME, M) \
|
||||
m##NAME##M = \
|
||||
svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \
|
||||
*(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero);
|
||||
|
||||
#define LOAD_KREST_2_LOW(NAME, M) \
|
||||
m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \
|
||||
zero, zero, zero, zero, zero);
|
||||
|
||||
#define LOAD_KREST_3(NAME, M) \
|
||||
m##NAME##M = \
|
||||
svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
|
||||
*(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \
|
||||
*(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero);
|
||||
|
||||
#define LOAD_KREST_3_LOW(NAME, M) \
|
||||
m##NAME##M = \
|
||||
svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
|
||||
*(ptr_##NAME##M + 2), zero, zero, zero, zero, zero);
|
||||
#define INIT_C_8x4 \
|
||||
do { \
|
||||
INIT_C(0, 0); \
|
||||
INIT_C(0, 1); \
|
||||
INIT_C(1, 0); \
|
||||
INIT_C(1, 1); \
|
||||
INIT_C(2, 0); \
|
||||
INIT_C(2, 1); \
|
||||
INIT_C(3, 0); \
|
||||
INIT_C(3, 1); \
|
||||
} while (0);
|
||||
|
||||
#ifdef ALPHA_ONE
|
||||
#define UPDATE_C(PG, PTR, DST, SRC) \
|
||||
do { \
|
||||
DST = svld1_f32((PG), (PTR)); \
|
||||
DST = svadd_z((PG), SRC, DST); \
|
||||
svst1_f32((PG), (PTR), DST); \
|
||||
} while (0);
|
||||
#else
|
||||
#define UPDATE_C(PG, PTR, DST, SRC) \
|
||||
do { \
|
||||
DST = svld1_f32((PG), (PTR)); \
|
||||
DST = svmad_z((PG), svalpha, SRC, DST); \
|
||||
svst1_f32((PG), (PTR), DST); \
|
||||
} while (0);
|
||||
#endif
|
||||
|
||||
#ifdef ALPHA_ONE
|
||||
int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||
|
@ -131,396 +68,329 @@ int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT
|
|||
int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
|
||||
#endif
|
||||
{
|
||||
BLASLONG pad_k = (k + 3) & ~3;
|
||||
|
||||
svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1;
|
||||
svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31,
|
||||
vc0, vc1, vc2, vc3, vc4, vc5, vc6, vc7,
|
||||
oc0, oc1, oc2, oc3, oc4, oc5, oc6, oc7;
|
||||
svfloat32_t svalpha = svdup_f32(alpha);
|
||||
|
||||
svbool_t pg16 = svptrue_b16();
|
||||
svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
|
||||
svbool_t pg32 = svptrue_b32();
|
||||
svbool_t pg32_low = svdupq_b32(1, 1, 0, 0);
|
||||
svbool_t pg32_first = svdupq_b32(1, 0, 0, 0);
|
||||
|
||||
bfloat16_t *ptr_a = (bfloat16_t *)A;
|
||||
bfloat16_t *ptr_b = (bfloat16_t *)B;
|
||||
FLOAT *ptr_c = C;
|
||||
|
||||
bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3;
|
||||
bfloat16_t *ptr_b0, *ptr_b1;
|
||||
FLOAT *ptr_c00, *ptr_c01;
|
||||
|
||||
svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1;
|
||||
svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31;
|
||||
#ifndef ALPHA_ONE
|
||||
svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31;
|
||||
#endif
|
||||
svbool_t pg16 = svptrue_b16();
|
||||
svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
|
||||
svbool_t pg32 = svptrue_b32();
|
||||
svbool_t pg32_low = svdupq_b32(1, 1, 0, 0);
|
||||
svbool_t pg32_even = svdupq_b32(1, 0, 1, 0);
|
||||
svbool_t pg32_first = svdupq_b32(1, 0, 0, 0);
|
||||
svfloat32_t svalpha = svdup_f32(alpha);
|
||||
bfloat16 tmp = 0;
|
||||
bfloat16_t zero = *((bfloat16_t *)&tmp);
|
||||
BLASLONG krest = k & 3;
|
||||
|
||||
// 00 01 10 11
|
||||
svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1);
|
||||
FLOAT *ptr_c0, *ptr_c1, *ptr_c2, *ptr_c3;
|
||||
|
||||
for (BLASLONG j = 0; j < n / 4; j++) {
|
||||
ptr_c00 = ptr_c;
|
||||
ptr_c01 = ptr_c + 2 * ldc;
|
||||
ptr_c0 = ptr_c;
|
||||
ptr_c1 = ptr_c0 + ldc;
|
||||
ptr_c2 = ptr_c1 + ldc;
|
||||
ptr_c3 = ptr_c2 + ldc;
|
||||
ptr_c += 4 * ldc;
|
||||
|
||||
ptr_a = (bfloat16_t *)A;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 8; i++) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a1 = ptr_a0 + 2 * k;
|
||||
ptr_a2 = ptr_a1 + 2 * k;
|
||||
ptr_a3 = ptr_a2 + 2 * k;
|
||||
ptr_a += 8 * k;
|
||||
ptr_a += 8 * pad_k;
|
||||
|
||||
ptr_b0 = ptr_b;
|
||||
ptr_b1 = ptr_b0 + 2 * k;
|
||||
|
||||
LOAD_C(0, 0); LOAD_C(0, 1);
|
||||
LOAD_C(1, 0); LOAD_C(1, 1);
|
||||
LOAD_C(2, 0); LOAD_C(2, 1);
|
||||
LOAD_C(3, 0); LOAD_C(3, 1);
|
||||
INIT_C_8x4;
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
|
||||
LOAD_B(0); LOAD_B(1);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
ma1 = svld1_bf16(pg16, ptr_a0 + 8);
|
||||
ma2 = svld1_bf16(pg16, ptr_a0 + 16);
|
||||
ma3 = svld1_bf16(pg16, ptr_a0 + 24);
|
||||
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
mb1 = svld1_bf16(pg16, ptr_b0 + 8);
|
||||
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
MATMUL(1, 0); MATMUL(1, 1);
|
||||
MATMUL(2, 0); MATMUL(2, 1);
|
||||
MATMUL(3, 0); MATMUL(3, 1);
|
||||
|
||||
ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
|
||||
ptr_b0 += 8; ptr_b1 += 8;
|
||||
ptr_a0 += 32;
|
||||
ptr_b0 += 16;
|
||||
}
|
||||
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
|
||||
LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
|
||||
LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
|
||||
LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
|
||||
LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
|
||||
LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
|
||||
LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
|
||||
}
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
MATMUL(1, 0); MATMUL(1, 1);
|
||||
MATMUL(2, 0); MATMUL(2, 1);
|
||||
MATMUL(3, 0); MATMUL(3, 1);
|
||||
}
|
||||
vc0 = svuzp1(mc00, mc10);
|
||||
vc1 = svuzp1(mc20, mc30);
|
||||
vc2 = svuzp2(mc00, mc10);
|
||||
vc3 = svuzp2(mc20, mc30);
|
||||
vc4 = svuzp1(mc01, mc11);
|
||||
vc5 = svuzp1(mc21, mc31);
|
||||
vc6 = svuzp2(mc01, mc11);
|
||||
vc7 = svuzp2(mc21, mc31);
|
||||
|
||||
STORE_C(0, 0); STORE_C(0, 1);
|
||||
STORE_C(1, 0); STORE_C(1, 1);
|
||||
STORE_C(2, 0); STORE_C(2, 1);
|
||||
STORE_C(3, 0); STORE_C(3, 1);
|
||||
UPDATE_C(pg32, ptr_c0, oc0, vc0);
|
||||
UPDATE_C(pg32, ptr_c0+4, oc1, vc1);
|
||||
UPDATE_C(pg32, ptr_c1, oc2, vc2);
|
||||
UPDATE_C(pg32, ptr_c1+4, oc3, vc3);
|
||||
UPDATE_C(pg32, ptr_c2, oc4, vc4)
|
||||
UPDATE_C(pg32, ptr_c2+4, oc5, vc5);
|
||||
UPDATE_C(pg32, ptr_c3, oc6, vc6)
|
||||
UPDATE_C(pg32, ptr_c3+4, oc7, vc7);
|
||||
|
||||
ptr_c00 += 8; ptr_c01 += 8;
|
||||
ptr_c0 += 8;
|
||||
ptr_c1 += 8;
|
||||
ptr_c2 += 8;
|
||||
ptr_c3 += 8;
|
||||
}
|
||||
|
||||
if (m & 4) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a1 = ptr_a0 + 2 * k;
|
||||
ptr_a += 4 * k;
|
||||
|
||||
ptr_a += 4 * pad_k;
|
||||
ptr_b0 = ptr_b;
|
||||
ptr_b1 = ptr_b0 + 2 * k;
|
||||
|
||||
LOAD_C(0, 0); LOAD_C(0, 1);
|
||||
LOAD_C(1, 0); LOAD_C(1, 1);
|
||||
INIT_C(0, 0); INIT_C(0, 1);
|
||||
INIT_C(1, 0); INIT_C(1, 1);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0); LOAD_A(1);
|
||||
LOAD_B(0); LOAD_B(1);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
ma1 = svld1_bf16(pg16, ptr_a0 + 8);
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
mb1 = svld1_bf16(pg16, ptr_b0 + 8);
|
||||
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
MATMUL(1, 0); MATMUL(1, 1);
|
||||
|
||||
ptr_a0 += 8; ptr_a1 += 8;
|
||||
ptr_b0 += 8; ptr_b1 += 8;
|
||||
ptr_a0 += 16;
|
||||
ptr_b0 += 16;
|
||||
}
|
||||
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
|
||||
LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
|
||||
LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
|
||||
LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
|
||||
}
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
MATMUL(1, 0); MATMUL(1, 1);
|
||||
}
|
||||
vc0 = svuzp1(mc00, mc10);
|
||||
vc1 = svuzp2(mc00, mc10);
|
||||
vc2 = svuzp1(mc01, mc11);
|
||||
vc3 = svuzp2(mc01, mc11);
|
||||
|
||||
STORE_C(0, 0); STORE_C(0, 1);
|
||||
STORE_C(1, 0); STORE_C(1, 1);
|
||||
UPDATE_C(pg32, ptr_c0, oc0, vc0);
|
||||
UPDATE_C(pg32, ptr_c1, oc1, vc1);
|
||||
UPDATE_C(pg32, ptr_c2, oc2, vc2);
|
||||
UPDATE_C(pg32, ptr_c3, oc3, vc3);
|
||||
|
||||
ptr_c00 += 4; ptr_c01 += 4;
|
||||
ptr_c0 += 4;
|
||||
ptr_c1 += 4;
|
||||
ptr_c2 += 4;
|
||||
ptr_c3 += 4;
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a += 2 * k;
|
||||
|
||||
ptr_a += 2 * pad_k;
|
||||
ptr_b0 = ptr_b;
|
||||
ptr_b1 = ptr_b0 + 2 * k;
|
||||
|
||||
LOAD_C(0, 0); LOAD_C(0, 1);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0);
|
||||
LOAD_B(0); LOAD_B(1);
|
||||
INIT_C(0, 0); INIT_C(0, 1);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
mb1 = svld1_bf16(pg16, ptr_b0 + 8);
|
||||
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
|
||||
ptr_a0 += 8;
|
||||
ptr_b0 += 8; ptr_b1 += 8;
|
||||
ptr_b0 += 16;
|
||||
}
|
||||
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0);
|
||||
LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0);
|
||||
LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0);
|
||||
LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
|
||||
}
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
}
|
||||
STORE_C(0, 0); STORE_C(0, 1);
|
||||
ptr_c00 += 2; ptr_c01 += 2;
|
||||
vc0 = svuzp1(mc00, mc00);
|
||||
vc1 = svuzp2(mc00, mc00);
|
||||
vc2 = svuzp1(mc01, mc01);
|
||||
vc3 = svuzp2(mc01, mc01);
|
||||
|
||||
UPDATE_C(pg32_low, ptr_c0, oc0, vc0);
|
||||
UPDATE_C(pg32_low, ptr_c1, oc1, vc1);
|
||||
UPDATE_C(pg32_low, ptr_c2, oc2, vc2);
|
||||
UPDATE_C(pg32_low, ptr_c3, oc3, vc3);
|
||||
|
||||
ptr_c0 += 2;
|
||||
ptr_c1 += 2;
|
||||
ptr_c2 += 2;
|
||||
ptr_c3 += 2;
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
ptr_a0 = ptr_a;
|
||||
|
||||
ptr_b0 = ptr_b;
|
||||
ptr_b1 = ptr_b0 + 2 * k;
|
||||
|
||||
LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
INIT_C(0, 0); INIT_C(0, 1);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16_low, ptr_a0);
|
||||
LOAD_B(0); LOAD_B(1);
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
mb1 = svld1_bf16(pg16, ptr_b0 + 8);
|
||||
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
|
||||
ptr_a0 += 4;
|
||||
ptr_b0 += 8;
|
||||
ptr_b1 += 8;
|
||||
ptr_b0 += 16;
|
||||
}
|
||||
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1_LOW(a, 0);
|
||||
LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2_LOW(a, 0);
|
||||
LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3_LOW(a, 0);
|
||||
LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
|
||||
}
|
||||
MATMUL(0, 0); MATMUL(0, 1);
|
||||
}
|
||||
STORE_C_LOW(0, 0); STORE_C_LOW(0, 1);
|
||||
vc1 = svuzp2(mc00, mc00);
|
||||
vc3 = svuzp2(mc01, mc01);
|
||||
|
||||
UPDATE_C(pg32_first, ptr_c0, oc0, mc00);
|
||||
UPDATE_C(pg32_first, ptr_c1, oc1, vc1);
|
||||
UPDATE_C(pg32_first, ptr_c2, oc2, mc01);
|
||||
UPDATE_C(pg32_first, ptr_c3, oc3, vc3);
|
||||
|
||||
}
|
||||
|
||||
ptr_b += 4 * k;
|
||||
ptr_b += 4 * pad_k;
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
ptr_c00 = ptr_c;
|
||||
ptr_c0 = ptr_c;
|
||||
ptr_c1 = ptr_c0 + ldc;
|
||||
ptr_c += 2 * ldc;
|
||||
|
||||
ptr_a = (bfloat16_t *)A;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 8; i++) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a1 = ptr_a0 + 2 * k;
|
||||
ptr_a2 = ptr_a1 + 2 * k;
|
||||
ptr_a3 = ptr_a2 + 2 * k;
|
||||
ptr_a += 8 * k;
|
||||
ptr_a += 8 * pad_k;
|
||||
|
||||
ptr_b0 = ptr_b;
|
||||
|
||||
LOAD_C(0, 0);
|
||||
LOAD_C(1, 0);
|
||||
LOAD_C(2, 0);
|
||||
LOAD_C(3, 0);
|
||||
INIT_C(0, 0);
|
||||
INIT_C(1, 0);
|
||||
INIT_C(2, 0);
|
||||
INIT_C(3, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
|
||||
LOAD_B(0);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
ma1 = svld1_bf16(pg16, ptr_a0 + 8);
|
||||
ma2 = svld1_bf16(pg16, ptr_a0 + 16);
|
||||
ma3 = svld1_bf16(pg16, ptr_a0 + 24);
|
||||
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
|
||||
MATMUL(0, 0);
|
||||
MATMUL(1, 0);
|
||||
MATMUL(2, 0);
|
||||
MATMUL(3, 0);
|
||||
|
||||
ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
|
||||
ptr_a0 += 32;
|
||||
ptr_b0 += 8;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
|
||||
LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
|
||||
LOAD_KREST_1(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
|
||||
LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
|
||||
LOAD_KREST_2(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
|
||||
LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
|
||||
LOAD_KREST_3(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
MATMUL(1, 0);
|
||||
MATMUL(2, 0);
|
||||
MATMUL(3, 0);
|
||||
}
|
||||
|
||||
STORE_C(0, 0);
|
||||
STORE_C(1, 0);
|
||||
STORE_C(2, 0);
|
||||
STORE_C(3, 0);
|
||||
vc0 = svuzp1(mc00, mc10);
|
||||
vc1 = svuzp1(mc20, mc30);
|
||||
vc2 = svuzp2(mc00, mc10);
|
||||
vc3 = svuzp2(mc20, mc30);
|
||||
|
||||
ptr_c00 += 8;
|
||||
UPDATE_C(pg32, ptr_c0, oc0, vc0);
|
||||
UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1);
|
||||
UPDATE_C(pg32, ptr_c1, oc2, vc2);
|
||||
UPDATE_C(pg32, ptr_c1 + 4, oc3, vc3);
|
||||
|
||||
ptr_c0 += 8;
|
||||
ptr_c1 += 8;
|
||||
}
|
||||
|
||||
if (m & 4) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a1 = ptr_a0 + 2 * k;
|
||||
ptr_a += 4 * k;
|
||||
|
||||
ptr_a += 4 * pad_k;
|
||||
ptr_b0 = ptr_b;
|
||||
|
||||
LOAD_C(0, 0);
|
||||
LOAD_C(1, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0); LOAD_A(1);
|
||||
LOAD_B(0);
|
||||
INIT_C(0, 0);
|
||||
INIT_C(1, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
ma1 = svld1_bf16(pg16, ptr_a0 + 8);
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
MATMUL(0, 0);
|
||||
MATMUL(1, 0);
|
||||
|
||||
ptr_a0 += 8; ptr_a1 += 8;
|
||||
ptr_a0 += 16;
|
||||
ptr_b0 += 8;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
|
||||
LOAD_KREST_1(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
|
||||
LOAD_KREST_2(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
|
||||
LOAD_KREST_3(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
MATMUL(1, 0);
|
||||
}
|
||||
STORE_C(0, 0)
|
||||
STORE_C(1, 0)
|
||||
|
||||
ptr_c00 += 4;
|
||||
vc0 = svuzp1(mc00, mc10);
|
||||
vc1 = svuzp2(mc00, mc10);
|
||||
|
||||
UPDATE_C(pg32, ptr_c0, oc0, vc0);
|
||||
UPDATE_C(pg32, ptr_c1, oc1, vc1);
|
||||
|
||||
ptr_c0 += 4;
|
||||
ptr_c1 += 4;
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a += 2 * k;
|
||||
ptr_a += 2 * pad_k;
|
||||
ptr_b0 = ptr_b;
|
||||
|
||||
LOAD_C(0, 0);
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0);
|
||||
LOAD_B(0);
|
||||
INIT_C(0, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
|
||||
MATMUL(0, 0);
|
||||
|
||||
ptr_a0 += 8;
|
||||
ptr_b0 += 8;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0);
|
||||
LOAD_KREST_1(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0);
|
||||
LOAD_KREST_2(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0);
|
||||
LOAD_KREST_3(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
}
|
||||
STORE_C(0, 0);
|
||||
ptr_c00 += 2;
|
||||
|
||||
vc0 = svuzp1(mc00, mc00);
|
||||
vc1 = svuzp2(mc00, mc00);
|
||||
UPDATE_C(pg32_low, ptr_c0, oc0, vc0);
|
||||
UPDATE_C(pg32_low, ptr_c1, oc1, vc1);
|
||||
|
||||
ptr_c0 += 2;
|
||||
ptr_c1 += 2;
|
||||
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
ptr_a0 = ptr_a;
|
||||
|
||||
ptr_b0 = ptr_b;
|
||||
|
||||
LOAD_C(0, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
INIT_C(0, 0);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16_low, ptr_a0);
|
||||
LOAD_B(0);
|
||||
mb0 = svld1_bf16(pg16, ptr_b0);
|
||||
MATMUL(0, 0);
|
||||
ptr_a0 += 4;
|
||||
ptr_b0 += 8;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1_LOW(a, 0);
|
||||
LOAD_KREST_1(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2_LOW(a, 0);
|
||||
LOAD_KREST_2(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3_LOW(a, 0);
|
||||
LOAD_KREST_3(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
}
|
||||
STORE_C_LOW(0, 0);
|
||||
vc1 = svuzp2(mc00, mc00);
|
||||
|
||||
UPDATE_C(pg32_first, ptr_c0, oc0, mc00);
|
||||
UPDATE_C(pg32_first, ptr_c1, oc1, vc1);
|
||||
}
|
||||
|
||||
ptr_b += 2 * k;
|
||||
ptr_b += 2 * pad_k;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
ptr_c00 = ptr_c;
|
||||
ptr_c0 = ptr_c;
|
||||
ptr_a = (bfloat16_t *)A;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 8; i++) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a1 = ptr_a0 + 2 * k;
|
||||
ptr_a2 = ptr_a1 + 2 * k;
|
||||
ptr_a3 = ptr_a2 + 2 * k;
|
||||
ptr_a += 8 * k;
|
||||
ptr_a += 8 * pad_k;
|
||||
|
||||
ptr_b0 = ptr_b;
|
||||
|
||||
LOAD_C_EVEN(0, 0);
|
||||
LOAD_C_EVEN(1, 0);
|
||||
LOAD_C_EVEN(2, 0);
|
||||
LOAD_C_EVEN(3, 0);
|
||||
INIT_C(0, 0);
|
||||
INIT_C(1, 0);
|
||||
INIT_C(2, 0);
|
||||
INIT_C(3, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
ma1 = svld1_bf16(pg16, ptr_a0 + 8);
|
||||
ma2 = svld1_bf16(pg16, ptr_a0 + 16);
|
||||
ma3 = svld1_bf16(pg16, ptr_a0 + 24);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
|
||||
mb0 = svld1_bf16(pg16_low, ptr_b0);
|
||||
|
||||
MATMUL(0, 0);
|
||||
|
@ -528,86 +398,48 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp
|
|||
MATMUL(2, 0);
|
||||
MATMUL(3, 0);
|
||||
|
||||
ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
|
||||
ptr_a0 += 32;
|
||||
ptr_b0 += 4;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
|
||||
LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
|
||||
LOAD_KREST_1_LOW(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
|
||||
LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
|
||||
LOAD_KREST_2_LOW(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
|
||||
LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
|
||||
LOAD_KREST_3_LOW(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
MATMUL(1, 0);
|
||||
MATMUL(2, 0);
|
||||
MATMUL(3, 0);
|
||||
}
|
||||
STORE_C_EVEN(0, 0)
|
||||
STORE_C_EVEN(1, 0);
|
||||
STORE_C_EVEN(2, 0);
|
||||
STORE_C_EVEN(3, 0);
|
||||
|
||||
ptr_c00 += 8;
|
||||
vc0 = svuzp1(mc00, mc10);
|
||||
vc1 = svuzp1(mc20, mc30);
|
||||
|
||||
UPDATE_C(pg32, ptr_c0, oc0, vc0);
|
||||
UPDATE_C(pg32, ptr_c0 + 4, oc1, vc1);
|
||||
|
||||
ptr_c0 += 8;
|
||||
}
|
||||
|
||||
if (m & 4) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a1 = ptr_a0 + 2 * k;
|
||||
ptr_a += 4 * k;
|
||||
|
||||
ptr_a += 4 * pad_k;
|
||||
ptr_b0 = ptr_b;
|
||||
|
||||
LOAD_C_EVEN(0, 0);
|
||||
LOAD_C_EVEN(1, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0); LOAD_A(1);
|
||||
INIT_C(0, 0);
|
||||
INIT_C(1, 0);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
ma1 = svld1_bf16(pg16, ptr_a0 + 8);
|
||||
mb0 = svld1_bf16(pg16_low, ptr_b0);
|
||||
|
||||
MATMUL(0, 0);
|
||||
MATMUL(1, 0);
|
||||
|
||||
ptr_a0 += 8; ptr_a1 += 8;
|
||||
ptr_a0 += 16;
|
||||
ptr_b0 += 4;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
|
||||
LOAD_KREST_1_LOW(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
|
||||
LOAD_KREST_2_LOW(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
|
||||
LOAD_KREST_3_LOW(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
MATMUL(1, 0);
|
||||
}
|
||||
STORE_C_EVEN(0, 0)
|
||||
STORE_C_EVEN(1, 0)
|
||||
|
||||
ptr_c00 += 4;
|
||||
vc0 = svuzp1(mc00, mc10);
|
||||
UPDATE_C(pg32, ptr_c0, oc0, vc0);
|
||||
ptr_c0 += 4;
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_a += 2 * k;
|
||||
|
||||
ptr_a += 2 * pad_k;
|
||||
ptr_b0 = ptr_b;
|
||||
|
||||
LOAD_C_EVEN(0, 0);
|
||||
INIT_C(0, 0);
|
||||
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
LOAD_A(0);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16, ptr_a0);
|
||||
mb0 = svld1_bf16(pg16_low, ptr_b0);
|
||||
|
||||
MATMUL(0, 0);
|
||||
|
@ -615,49 +447,23 @@ int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alp
|
|||
ptr_a0 += 8;
|
||||
ptr_b0 += 4;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1(a, 0);
|
||||
LOAD_KREST_1_LOW(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2(a, 0);
|
||||
LOAD_KREST_2_LOW(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3(a, 0);
|
||||
LOAD_KREST_3_LOW(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
}
|
||||
STORE_C_EVEN(0, 0);
|
||||
ptr_c00 += 2;
|
||||
vc0 = svuzp1(mc00, mc00);
|
||||
UPDATE_C(pg32_low, ptr_c0, oc0, vc0);
|
||||
ptr_c0 += 2;
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
ptr_a0 = ptr_a;
|
||||
ptr_b0 = ptr_b;
|
||||
LOAD_C_FIRST(0, 0);
|
||||
for (BLASLONG p = 0; p < k / 4; p++) {
|
||||
INIT_C(0, 0);
|
||||
for (BLASLONG p = 0; p < pad_k; p += 4) {
|
||||
ma0 = svld1_bf16(pg16_low, ptr_a0);
|
||||
mb0 = svld1_bf16(pg16_low, ptr_b0);
|
||||
|
||||
MATMUL(0, 0);
|
||||
|
||||
ptr_a0 += 4;
|
||||
ptr_b0 += 4;
|
||||
}
|
||||
if (krest) {
|
||||
if (krest == 1) {
|
||||
LOAD_KREST_1_LOW(a, 0);
|
||||
LOAD_KREST_1_LOW(b, 0);
|
||||
} else if (krest == 2) {
|
||||
LOAD_KREST_2_LOW(a, 0);
|
||||
LOAD_KREST_2_LOW(b, 0);
|
||||
} else if (krest == 3) {
|
||||
LOAD_KREST_3_LOW(a, 0);
|
||||
LOAD_KREST_3_LOW(b, 0);
|
||||
}
|
||||
MATMUL(0, 0);
|
||||
}
|
||||
STORE_C_FIRST(0, 0);
|
||||
UPDATE_C(pg32_first, ptr_c0, oc0, mc00);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,126 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
|
||||
IFLOAT *a_offset;
|
||||
IFLOAT *a_offsetx[4];
|
||||
IFLOAT *b_offset;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
|
||||
svbfloat16_t v0, v1, v2, v3;
|
||||
|
||||
for (BLASLONG j = 0; j < n / 4; j++) {
|
||||
a_offsetx[0] = a_offset;
|
||||
a_offsetx[1] = a_offsetx[0] + lda;
|
||||
a_offsetx[2] = a_offsetx[1] + lda;
|
||||
a_offsetx[3] = a_offsetx[2] + lda;
|
||||
a_offset += 4 * lda;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
|
||||
v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
|
||||
v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]);
|
||||
v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]);
|
||||
|
||||
svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
|
||||
svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
|
||||
svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2);
|
||||
svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3);
|
||||
|
||||
b_offset += 16;
|
||||
a_offsetx[0] += 4;
|
||||
a_offsetx[1] += 4;
|
||||
a_offsetx[2] += 4;
|
||||
a_offsetx[3] += 4;
|
||||
}
|
||||
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
for (BLASLONG col = 0; col < 4; col++) {
|
||||
b_offset[4 * col] = a_offsetx[col][0];
|
||||
b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1];
|
||||
b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2];
|
||||
b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3];
|
||||
}
|
||||
b_offset += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
a_offsetx[0] = a_offset;
|
||||
a_offsetx[1] = a_offsetx[0] + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
|
||||
v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]);
|
||||
svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
|
||||
svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1);
|
||||
|
||||
b_offset += 8;
|
||||
a_offsetx[0] += 4;
|
||||
a_offsetx[1] += 4;
|
||||
}
|
||||
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
for (BLASLONG col = 0; col < 2; col++) {
|
||||
b_offset[4 * col] = a_offsetx[col][0];
|
||||
b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1];
|
||||
b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2];
|
||||
b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3];
|
||||
}
|
||||
b_offset += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
a_offsetx[0] = a_offset;
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]);
|
||||
svst1_bf16(pg16, (bfloat16_t *)b_offset, v0);
|
||||
b_offset += 4;
|
||||
a_offsetx[0] += 4;
|
||||
}
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
b_offset[0] = a_offsetx[0][0];
|
||||
b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1];
|
||||
b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2];
|
||||
b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3];
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,101 +0,0 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
|
||||
IFLOAT *a_offset, *a_offset1, *a_offset2;
|
||||
IFLOAT *b_offset;
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for (BLASLONG j = 0; j < n / 2; j++) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset += 2 * lda;
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset1 + 1);
|
||||
*(b_offset + 2) = *(a_offset1 + 2);
|
||||
*(b_offset + 3) = *(a_offset1 + 3);
|
||||
*(b_offset + 4) = *(a_offset2 + 0);
|
||||
*(b_offset + 5) = *(a_offset2 + 1);
|
||||
*(b_offset + 6) = *(a_offset2 + 2);
|
||||
*(b_offset + 7) = *(a_offset2 + 3);
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
b_offset += 8;
|
||||
}
|
||||
BLASLONG rest = m & 3;
|
||||
if (rest == 3) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset1 + 1);
|
||||
*(b_offset + 2) = *(a_offset1 + 2);
|
||||
*(b_offset + 3) = *(a_offset2 + 0);
|
||||
*(b_offset + 4) = *(a_offset2 + 1);
|
||||
*(b_offset + 5) = *(a_offset2 + 2);
|
||||
b_offset += 6;
|
||||
} else if (rest == 2) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset1 + 1);
|
||||
*(b_offset + 2) = *(a_offset2 + 0);
|
||||
*(b_offset + 3) = *(a_offset2 + 1);
|
||||
b_offset += 4;
|
||||
} else if (rest == 1) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset2 + 0);
|
||||
b_offset += 2;
|
||||
}
|
||||
}
|
||||
if (n & 1) {
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
*(b_offset + 0) = *(a_offset + 0);
|
||||
*(b_offset + 1) = *(a_offset + 1);
|
||||
*(b_offset + 2) = *(a_offset + 2);
|
||||
*(b_offset + 3) = *(a_offset + 3);
|
||||
|
||||
b_offset += 4;
|
||||
a_offset += 4;
|
||||
}
|
||||
BLASLONG rest = m & 3;
|
||||
if (rest == 3) {
|
||||
*(b_offset + 0) = *(a_offset + 0);
|
||||
*(b_offset + 1) = *(a_offset + 1);
|
||||
*(b_offset + 2) = *(a_offset + 2);
|
||||
} else if (rest == 2) {
|
||||
*(b_offset + 0) = *(a_offset + 0);
|
||||
*(b_offset + 1) = *(a_offset + 1);
|
||||
} else if (rest == 1) {
|
||||
*(b_offset + 0) = *(a_offset + 0);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
|
||||
IFLOAT *a_offset, *a_offset0, *a_offset1, *a_offset2, *a_offset3;
|
||||
IFLOAT *b_offset;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for (BLASLONG j = 0; j < n / 8; j++) {
|
||||
a_offset0 = a_offset;
|
||||
a_offset1 = a_offset0 + lda;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset += 8;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
for (BLASLONG line = 0; line < 8; line++) {
|
||||
b_offset[line * 4] = a_offset0[line];
|
||||
b_offset[line * 4 + 1] = a_offset1[line];
|
||||
b_offset[line * 4 + 2] = a_offset2[line];
|
||||
b_offset[line * 4 + 3] = a_offset3[line];
|
||||
}
|
||||
|
||||
b_offset += 32;
|
||||
a_offset0 += 4 * lda;
|
||||
a_offset1 += 4 * lda;
|
||||
a_offset2 += 4 * lda;
|
||||
a_offset3 += 4 * lda;
|
||||
}
|
||||
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
for (BLASLONG line = 0; line < 8; line++) {
|
||||
b_offset[line * 4] = a_offset0[line];
|
||||
b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
|
||||
b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
|
||||
b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];
|
||||
}
|
||||
b_offset += 32;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 4) {
|
||||
a_offset0 = a_offset;
|
||||
a_offset1 = a_offset0 + lda;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset += 4;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
for (BLASLONG line = 0; line < 4; line++) {
|
||||
b_offset[line * 4] = a_offset0[line];
|
||||
b_offset[line * 4 + 1] = a_offset1[line];
|
||||
b_offset[line * 4 + 2] = a_offset2[line];
|
||||
b_offset[line * 4 + 3] = a_offset3[line];
|
||||
}
|
||||
|
||||
b_offset += 16;
|
||||
a_offset0 += 4 * lda;
|
||||
a_offset1 += 4 * lda;
|
||||
a_offset2 += 4 * lda;
|
||||
a_offset3 += 4 * lda;
|
||||
}
|
||||
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
for (BLASLONG line = 0; line < 4; line++) {
|
||||
b_offset[line * 4] = a_offset0[line];
|
||||
b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
|
||||
b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
|
||||
b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];
|
||||
}
|
||||
b_offset += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
a_offset0 = a_offset;
|
||||
a_offset1 = a_offset0 + lda;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset += 2;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
for (BLASLONG line = 0; line < 2; line++) {
|
||||
b_offset[line * 4] = a_offset0[line];
|
||||
b_offset[line * 4 + 1] = a_offset1[line];
|
||||
b_offset[line * 4 + 2] = a_offset2[line];
|
||||
b_offset[line * 4 + 3] = a_offset3[line];
|
||||
}
|
||||
b_offset += 8;
|
||||
a_offset0 += 4 * lda;
|
||||
a_offset1 += 4 * lda;
|
||||
a_offset2 += 4 * lda;
|
||||
a_offset3 += 4 * lda;
|
||||
}
|
||||
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
for (BLASLONG line = 0; line < 2; line++) {
|
||||
b_offset[line * 4] = a_offset0[line];
|
||||
b_offset[line * 4 + 1] = rest == 1 ? 0 : a_offset1[line];
|
||||
b_offset[line * 4 + 2] = rest <= 2 ? 0 : a_offset2[line];
|
||||
b_offset[line * 4 + 3] = rest <= 3 ? 0 : a_offset3[line];
|
||||
}
|
||||
b_offset += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
a_offset0 = a_offset;
|
||||
a_offset1 = a_offset0 + lda;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
b_offset[0] = *a_offset0;
|
||||
b_offset[1] = *a_offset1;
|
||||
b_offset[2] = *a_offset2;
|
||||
b_offset[3] = *a_offset3;
|
||||
b_offset += 4;
|
||||
a_offset0 += 4 * lda;
|
||||
a_offset1 += 4 * lda;
|
||||
a_offset2 += 4 * lda;
|
||||
a_offset3 += 4 * lda;
|
||||
}
|
||||
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
b_offset[0] = *a_offset0;
|
||||
b_offset[1] = rest == 1 ? 0 : *a_offset1;
|
||||
b_offset[2] = rest <= 2 ? 0 : *a_offset2;
|
||||
b_offset[3] = rest <= 3 ? 0 : *a_offset3;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@ -1,109 +0,0 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
|
||||
IFLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
|
||||
IFLOAT *b_offset;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for (BLASLONG j = 0; j < n / 2; j++) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset += 2;
|
||||
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset2 + 0);
|
||||
*(b_offset + 2) = *(a_offset3 + 0);
|
||||
*(b_offset + 3) = *(a_offset4 + 0);
|
||||
*(b_offset + 4) = *(a_offset1 + 1);
|
||||
*(b_offset + 5) = *(a_offset2 + 1);
|
||||
*(b_offset + 6) = *(a_offset3 + 1);
|
||||
*(b_offset + 7) = *(a_offset4 + 1);
|
||||
|
||||
b_offset += 8;
|
||||
a_offset1 += 4 * lda;
|
||||
a_offset2 += 4 * lda;
|
||||
a_offset3 += 4 * lda;
|
||||
a_offset4 += 4 * lda;
|
||||
}
|
||||
|
||||
if (m & 3) {
|
||||
BLASLONG rest = m & 3;
|
||||
if (rest == 3) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset2 + 0);
|
||||
*(b_offset + 2) = *(a_offset3 + 0);
|
||||
*(b_offset + 3) = *(a_offset1 + 1);
|
||||
*(b_offset + 4) = *(a_offset2 + 1);
|
||||
*(b_offset + 5) = *(a_offset3 + 1);
|
||||
b_offset += 6;
|
||||
} else if (rest == 2) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset2 + 0);
|
||||
*(b_offset + 2) = *(a_offset1 + 1);
|
||||
*(b_offset + 3) = *(a_offset2 + 1);
|
||||
b_offset += 4;
|
||||
} else if (rest == 1) {
|
||||
*(b_offset + 0) = *(a_offset1 + 0);
|
||||
*(b_offset + 1) = *(a_offset1 + 1);
|
||||
b_offset += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (n & 1) {
|
||||
for (BLASLONG i = 0; i < m / 4; i++) {
|
||||
*(b_offset + 0) = *(a_offset);
|
||||
*(b_offset + 1) = *(a_offset + lda);
|
||||
*(b_offset + 2) = *(a_offset + lda * 2);
|
||||
*(b_offset + 3) = *(a_offset + lda * 3);
|
||||
|
||||
b_offset += 4;
|
||||
a_offset += 4 * lda;
|
||||
}
|
||||
BLASLONG rest = m & 3;
|
||||
if (rest == 3) {
|
||||
*(b_offset + 0) = *(a_offset);
|
||||
*(b_offset + 1) = *(a_offset + lda);
|
||||
*(b_offset + 2) = *(a_offset + lda * 2);
|
||||
} else if (rest == 2) {
|
||||
*(b_offset + 0) = *(a_offset);
|
||||
*(b_offset + 1) = *(a_offset + lda);
|
||||
} else if (rest == 1) {
|
||||
*(b_offset + 0) = *(a_offset);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
|
||||
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
|
||||
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
|
||||
#if defined(DSDOT)
|
||||
v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7;
|
||||
v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7;
|
||||
v2f64 dot0 = {0, 0};
|
||||
v2f64 dot1 = {0, 0};
|
||||
v2f64 dot2 = {0, 0};
|
||||
v2f64 dot3 = {0, 0};
|
||||
#else
|
||||
v4f32 dot0 = {0, 0, 0, 0};
|
||||
v4f32 dot1 = {0, 0, 0, 0};
|
||||
v4f32 dot2 = {0, 0, 0, 0};
|
||||
v4f32 dot3 = {0, 0, 0, 0};
|
||||
#endif
|
||||
|
||||
if (n < 1) return (dot);
|
||||
|
||||
|
@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
x_pref += 32;
|
||||
y_pref += 32;
|
||||
|
||||
#if defined(DSDOT)
|
||||
/* Extend single precision to double precision */
|
||||
dvy0 = __msa_fexupr_d(vy0);
|
||||
dvy1 = __msa_fexupr_d(vy1);
|
||||
dvy2 = __msa_fexupr_d(vy2);
|
||||
dvy3 = __msa_fexupr_d(vy3);
|
||||
dvy4 = __msa_fexupr_d(vy4);
|
||||
dvy5 = __msa_fexupr_d(vy5);
|
||||
dvy6 = __msa_fexupr_d(vy6);
|
||||
dvy7 = __msa_fexupr_d(vy7);
|
||||
|
||||
vy0 = (v4f32)__msa_fexupl_d(vy0);
|
||||
vy1 = (v4f32)__msa_fexupl_d(vy1);
|
||||
vy2 = (v4f32)__msa_fexupl_d(vy2);
|
||||
vy3 = (v4f32)__msa_fexupl_d(vy3);
|
||||
vy4 = (v4f32)__msa_fexupl_d(vy4);
|
||||
vy5 = (v4f32)__msa_fexupl_d(vy5);
|
||||
vy6 = (v4f32)__msa_fexupl_d(vy6);
|
||||
vy7 = (v4f32)__msa_fexupl_d(vy7);
|
||||
|
||||
dvx0 = __msa_fexupr_d(vx0);
|
||||
dvx1 = __msa_fexupr_d(vx1);
|
||||
dvx2 = __msa_fexupr_d(vx2);
|
||||
dvx3 = __msa_fexupr_d(vx3);
|
||||
dvx4 = __msa_fexupr_d(vx4);
|
||||
dvx5 = __msa_fexupr_d(vx5);
|
||||
dvx6 = __msa_fexupr_d(vx6);
|
||||
dvx7 = __msa_fexupr_d(vx7);
|
||||
|
||||
vx0 = (v4f32)__msa_fexupl_d(vx0);
|
||||
vx1 = (v4f32)__msa_fexupl_d(vx1);
|
||||
vx2 = (v4f32)__msa_fexupl_d(vx2);
|
||||
vx3 = (v4f32)__msa_fexupl_d(vx3);
|
||||
vx4 = (v4f32)__msa_fexupl_d(vx4);
|
||||
vx5 = (v4f32)__msa_fexupl_d(vx5);
|
||||
vx6 = (v4f32)__msa_fexupl_d(vx6);
|
||||
vx7 = (v4f32)__msa_fexupl_d(vx7);
|
||||
|
||||
dot0 += (dvy0 * dvx0);
|
||||
dot1 += (dvy1 * dvx1);
|
||||
dot2 += (dvy2 * dvx2);
|
||||
dot3 += (dvy3 * dvx3);
|
||||
dot0 += (dvy4 * dvx4);
|
||||
dot1 += (dvy5 * dvx5);
|
||||
dot2 += (dvy6 * dvx6);
|
||||
dot3 += (dvy7 * dvx7);
|
||||
dot0 += ((v2f64)vy0 * (v2f64)vx0);
|
||||
dot1 += ((v2f64)vy1 * (v2f64)vx1);
|
||||
dot2 += ((v2f64)vy2 * (v2f64)vx2);
|
||||
dot3 += ((v2f64)vy3 * (v2f64)vx3);
|
||||
dot0 += ((v2f64)vy4 * (v2f64)vx4);
|
||||
dot1 += ((v2f64)vy5 * (v2f64)vx5);
|
||||
dot2 += ((v2f64)vy6 * (v2f64)vx6);
|
||||
dot3 += ((v2f64)vy7 * (v2f64)vx7);
|
||||
#else
|
||||
dot0 += (vy0 * vx0);
|
||||
dot1 += (vy1 * vx1);
|
||||
dot2 += (vy2 * vx2);
|
||||
|
@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
dot1 += (vy5 * vx5);
|
||||
dot2 += (vy6 * vx6);
|
||||
dot3 += (vy7 * vx7);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
|
@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
|
||||
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
|
||||
|
||||
#if defined(DSDOT)
|
||||
dvy0 = __msa_fexupr_d(vy0);
|
||||
dvy1 = __msa_fexupr_d(vy1);
|
||||
dvy2 = __msa_fexupr_d(vy2);
|
||||
dvy3 = __msa_fexupr_d(vy3);
|
||||
|
||||
vy0 = (v4f32)__msa_fexupl_d(vy0);
|
||||
vy1 = (v4f32)__msa_fexupl_d(vy1);
|
||||
vy2 = (v4f32)__msa_fexupl_d(vy2);
|
||||
vy3 = (v4f32)__msa_fexupl_d(vy3);
|
||||
|
||||
dvx0 = __msa_fexupr_d(vx0);
|
||||
dvx1 = __msa_fexupr_d(vx1);
|
||||
dvx2 = __msa_fexupr_d(vx2);
|
||||
dvx3 = __msa_fexupr_d(vx3);
|
||||
|
||||
vx0 = (v4f32)__msa_fexupl_d(vx0);
|
||||
vx1 = (v4f32)__msa_fexupl_d(vx1);
|
||||
vx2 = (v4f32)__msa_fexupl_d(vx2);
|
||||
vx3 = (v4f32)__msa_fexupl_d(vx3);
|
||||
|
||||
dot0 += (dvy0 * dvx0);
|
||||
dot1 += (dvy1 * dvx1);
|
||||
dot2 += (dvy2 * dvx2);
|
||||
dot3 += (dvy3 * dvx3);
|
||||
dot0 += ((v2f64)vy0 * (v2f64)vx0);
|
||||
dot1 += ((v2f64)vy1 * (v2f64)vx1);
|
||||
dot2 += ((v2f64)vy2 * (v2f64)vx2);
|
||||
dot3 += ((v2f64)vy3 * (v2f64)vx3);
|
||||
#else
|
||||
dot0 += (vy0 * vx0);
|
||||
dot1 += (vy1 * vx1);
|
||||
dot2 += (vy2 * vx2);
|
||||
dot3 += (vy3 * vx3);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
|
@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
LD_SP2_INC(x, 4, vx0, vx1);
|
||||
LD_SP2_INC(y, 4, vy0, vy1);
|
||||
|
||||
#if defined(DSDOT)
|
||||
dvy0 = __msa_fexupr_d(vy0);
|
||||
dvy1 = __msa_fexupr_d(vy1);
|
||||
|
||||
vy0 = (v4f32)__msa_fexupl_d(vy0);
|
||||
vy1 = (v4f32)__msa_fexupl_d(vy1);
|
||||
|
||||
dvx0 = __msa_fexupr_d(vx0);
|
||||
dvx1 = __msa_fexupr_d(vx1);
|
||||
|
||||
vx0 = (v4f32)__msa_fexupl_d(vx0);
|
||||
vx1 = (v4f32)__msa_fexupl_d(vx1);
|
||||
|
||||
dot0 += (dvy0 * dvx0);
|
||||
dot1 += (dvy1 * dvx1);
|
||||
dot0 += ((v2f64)vy0 * (v2f64)vx0);
|
||||
dot1 += ((v2f64)vy1 * (v2f64)vx1);
|
||||
#else
|
||||
dot0 += (vy0 * vx0);
|
||||
dot1 += (vy1 * vx1);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
|
@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
vx0 = LD_SP(x); x += 4;
|
||||
vy0 = LD_SP(y); y += 4;
|
||||
|
||||
#if defined(DSDOT)
|
||||
dvy0 = __msa_fexupr_d(vy0);
|
||||
vy0 = (v4f32)__msa_fexupl_d(vy0);
|
||||
dvx0 = __msa_fexupr_d(vx0);
|
||||
vx0 = (v4f32)__msa_fexupl_d(vx0);
|
||||
dot0 += (dvy0 * dvx0);
|
||||
dot0 += ((v2f64)vy0 * (v2f64)vx0);
|
||||
#else
|
||||
dot0 += (vy0 * vx0);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
LD_GP2_INC(x, 1, x0, x1);
|
||||
LD_GP2_INC(y, 1, y0, y1);
|
||||
|
||||
#if defined(DSDOT)
|
||||
dot += ((double)y0 * (double)x0);
|
||||
dot += ((double)y1 * (double)x1);
|
||||
#else
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
#if defined(DSDOT)
|
||||
dot += ((double)y0 * (double)x0);
|
||||
#else
|
||||
dot += (y0 * x0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
dot += dot0[0];
|
||||
dot += dot0[1];
|
||||
#if !defined(DSDOT)
|
||||
dot += dot0[2];
|
||||
dot += dot0[3];
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
|
||||
|
||||
#if defined(DSDOT)
|
||||
dot += ((double)y0 * (double)x0);
|
||||
dot += ((double)y1 * (double)x1);
|
||||
dot += ((double)y2 * (double)x2);
|
||||
dot += ((double)y3 * (double)x3);
|
||||
#else
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
dot += (y3 * x3);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
LD_GP2_INC(x, inc_x, x0, x1);
|
||||
LD_GP2_INC(y, inc_y, y0, y1);
|
||||
|
||||
#if defined(DSDOT)
|
||||
dot += ((double)y0 * (double)x0);
|
||||
dot += ((double)y1 * (double)x1);
|
||||
#else
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
#if defined(DSDOT)
|
||||
dot += ((double)y0 * (double)x0);
|
||||
#else
|
||||
dot += (y0 * x0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,160 @@
|
|||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#Pure C for other kernels
|
||||
SAMAXKERNEL = ../mips/amax.c
|
||||
DAMAXKERNEL = ../mips/amax.c
|
||||
CAMAXKERNEL = ../mips/zamax.c
|
||||
ZAMAXKERNEL = ../mips/zamax.c
|
||||
|
||||
SAMINKERNEL = ../mips/amin.c
|
||||
DAMINKERNEL = ../mips/amin.c
|
||||
CAMINKERNEL = ../mips/zamin.c
|
||||
ZAMINKERNEL = ../mips/zamin.c
|
||||
|
||||
SMAXKERNEL = ../mips/max.c
|
||||
DMAXKERNEL = ../mips/max.c
|
||||
|
||||
SMINKERNEL = ../mips/min.c
|
||||
DMINKERNEL = ../mips/min.c
|
||||
|
||||
ISAMAXKERNEL = ../mips/iamax.c
|
||||
IDAMAXKERNEL = ../mips/iamax.c
|
||||
ICAMAXKERNEL = ../mips/izamax.c
|
||||
IZAMAXKERNEL = ../mips/izamax.c
|
||||
|
||||
ISAMINKERNEL = ../mips/iamin.c
|
||||
IDAMINKERNEL = ../mips/iamin.c
|
||||
ICAMINKERNEL = ../mips/izamin.c
|
||||
IZAMINKERNEL = ../mips/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../mips/imax.c
|
||||
IDMAXKERNEL = ../mips/imax.c
|
||||
|
||||
ISMINKERNEL = ../mips/imin.c
|
||||
IDMINKERNEL = ../mips/imin.c
|
||||
|
||||
SASUMKERNEL = ../mips/asum.c
|
||||
DASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/zasum.c
|
||||
ZASUMKERNEL = ../mips/zasum.c
|
||||
|
||||
SSUMKERNEL = ../mips/sum.c
|
||||
DSUMKERNEL = ../mips/sum.c
|
||||
CSUMKERNEL = ../mips/zsum.c
|
||||
ZSUMKERNEL = ../mips/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../mips/axpy.c
|
||||
DAXPYKERNEL = ../mips/axpy.c
|
||||
CAXPYKERNEL = ../mips/zaxpy.c
|
||||
ZAXPYKERNEL = ../mips/zaxpy.c
|
||||
|
||||
SCOPYKERNEL = ../mips/copy.c
|
||||
DCOPYKERNEL = ../mips/copy.c
|
||||
CCOPYKERNEL = ../mips/zcopy.c
|
||||
ZCOPYKERNEL = ../mips/zcopy.c
|
||||
|
||||
SDOTKERNEL = ../mips/dot.c
|
||||
DDOTKERNEL = ../mips/dot.c
|
||||
CDOTKERNEL = ../mips/zdot.c
|
||||
ZDOTKERNEL = ../mips/zdot.c
|
||||
|
||||
SNRM2KERNEL = ../mips/nrm2.c
|
||||
DNRM2KERNEL = ../mips/nrm2.c
|
||||
CNRM2KERNEL = ../mips/znrm2.c
|
||||
ZNRM2KERNEL = ../mips/znrm2.c
|
||||
|
||||
SROTKERNEL = ../mips/rot.c
|
||||
DROTKERNEL = ../mips/rot.c
|
||||
CROTKERNEL = ../mips/zrot.c
|
||||
ZROTKERNEL = ../mips/zrot.c
|
||||
|
||||
SSCALKERNEL = ../mips/scal.c
|
||||
DSCALKERNEL = ../mips/scal.c
|
||||
CSCALKERNEL = ../mips/zscal.c
|
||||
ZSCALKERNEL = ../mips/zscal.c
|
||||
|
||||
SSWAPKERNEL = ../mips/swap.c
|
||||
DSWAPKERNEL = ../mips/swap.c
|
||||
CSWAPKERNEL = ../mips/zswap.c
|
||||
ZSWAPKERNEL = ../mips/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../mips/gemv_n.c
|
||||
DGEMVNKERNEL = ../mips/gemv_n.c
|
||||
CGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../mips/gemv_t.c
|
||||
DGEMVTKERNEL = ../mips/gemv_t.c
|
||||
CGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
|
||||
SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
|
@ -90,7 +90,7 @@
|
|||
//Init INF
|
||||
lui TEMP, 0x7FF0
|
||||
dsll TEMP, TEMP, 32
|
||||
MTC1 TEMP, INF
|
||||
MTC TEMP, INF
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddiu N, N, -1
|
||||
|
|
|
@ -62,6 +62,8 @@ gotoblas_t TABLE_NAME = {
|
|||
MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N),
|
||||
#endif
|
||||
|
||||
SBGEMM_ALIGN_K,
|
||||
|
||||
sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
|
||||
|
||||
samax_kTS, samin_kTS, smax_kTS, smin_kTS,
|
||||
|
@ -866,7 +868,7 @@ gotoblas_t TABLE_NAME = {
|
|||
cgeadd_kTS,
|
||||
#endif
|
||||
#if BUILD_COMPLEX16==1
|
||||
zgeadd_kTS
|
||||
zgeadd_kTS,
|
||||
#endif
|
||||
};
|
||||
|
||||
|
|
|
@ -44,8 +44,5 @@ DGEMM_BETA = dgemm_beta_skylakex.c
|
|||
CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c
|
||||
ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
|
||||
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
|
|
|
@ -25,10 +25,25 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
|
||||
* Avoid contraction of floating point operations, specifically fused
|
||||
* multiply-add, because they can cause unexpected results in complex
|
||||
* multiplication.
|
||||
*/
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC optimize ("fp-contract=off")
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#pragma clang fp contract(off)
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
|
||||
#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
|
||||
#include "cscal_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "cscal_microk_haswell-2.c"
|
||||
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#include "cscal_microk_bulldozer-2.c"
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014-2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
/* _mm512_addsub_ps does not exist so we flip signs for odd elements of da_i */
|
||||
__m512 da_r = _mm512_set1_ps(alpha[0]);
|
||||
__m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1);
|
||||
for (; i < n2; i += 32) {
|
||||
__m512 x0 = _mm512_loadu_ps(&x[i + 0]);
|
||||
__m512 x1 = _mm512_loadu_ps(&x[i + 16]);
|
||||
__m512 y0 = _mm512_permute_ps(x0, 0xb1);
|
||||
__m512 y1 = _mm512_permute_ps(x1, 0xb1);
|
||||
_mm512_storeu_ps(&x[i + 0], _mm512_add_ps(da_r * x0, da_i * y0));
|
||||
_mm512_storeu_ps(&x[i + 16], _mm512_add_ps(da_r * x1, da_i * y1));
|
||||
}
|
||||
#else
|
||||
__m256 da_r = _mm256_set1_ps(alpha[0]);
|
||||
__m256 da_i = _mm256_set1_ps(alpha[1]);
|
||||
for (; i < n2; i += 32) {
|
||||
__m256 x0 = _mm256_loadu_ps(&x[i + 0]);
|
||||
__m256 x1 = _mm256_loadu_ps(&x[i + 8]);
|
||||
__m256 x2 = _mm256_loadu_ps(&x[i + 16]);
|
||||
__m256 x3 = _mm256_loadu_ps(&x[i + 24]);
|
||||
__m256 y0 = _mm256_permute_ps(x0, 0xb1);
|
||||
__m256 y1 = _mm256_permute_ps(x1, 0xb1);
|
||||
__m256 y2 = _mm256_permute_ps(x2, 0xb1);
|
||||
__m256 y3 = _mm256_permute_ps(x3, 0xb1);
|
||||
_mm256_storeu_ps(&x[i + 0], _mm256_addsub_ps(da_r * x0, da_i * y0));
|
||||
_mm256_storeu_ps(&x[i + 8], _mm256_addsub_ps(da_r * x1, da_i * y1));
|
||||
_mm256_storeu_ps(&x[i + 16], _mm256_addsub_ps(da_r * x2, da_i * y2));
|
||||
_mm256_storeu_ps(&x[i + 24], _mm256_addsub_ps(da_r * x3, da_i * y3));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512 da_i = _mm512_set1_ps(alpha[1]) * _mm512_set4_ps(1, -1, 1, -1);
|
||||
for (; i < n2; i += 32) {
|
||||
__m512 y0 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 0]), 0xb1);
|
||||
__m512 y1 = _mm512_permute_ps(_mm512_loadu_ps(&x[i + 16]), 0xb1);
|
||||
_mm512_storeu_ps(&x[i + 0], da_i * y0);
|
||||
_mm512_storeu_ps(&x[i + 16], da_i * y1);
|
||||
}
|
||||
#else
|
||||
__m256 da_i = _mm256_set1_ps(alpha[1]) * _mm256_set_ps(1, -1, 1, -1, 1, -1, 1, -1);
|
||||
for (; i < n2; i += 32) {
|
||||
__m256 y0 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 0]), 0xb1);
|
||||
__m256 y1 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 8]), 0xb1);
|
||||
__m256 y2 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 16]), 0xb1);
|
||||
__m256 y3 = _mm256_permute_ps(_mm256_loadu_ps(&x[i + 24]), 0xb1);
|
||||
_mm256_storeu_ps(&x[i + 0], da_i * y0);
|
||||
_mm256_storeu_ps(&x[i + 8], da_i * y1);
|
||||
_mm256_storeu_ps(&x[i + 16], da_i * y2);
|
||||
_mm256_storeu_ps(&x[i + 24], da_i * y3);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512 da_r = _mm512_set1_ps(alpha[0]);
|
||||
for (; i < n2; i += 32) {
|
||||
_mm512_storeu_ps(&x[i + 0], da_r * _mm512_loadu_ps(&x[i + 0]));
|
||||
_mm512_storeu_ps(&x[i + 16], da_r * _mm512_loadu_ps(&x[i + 16]));
|
||||
}
|
||||
#else
|
||||
__m256 da_r = _mm256_set1_ps(alpha[0]);
|
||||
for (; i < n2; i += 32) {
|
||||
_mm256_storeu_ps(&x[i + 0], da_r * _mm256_loadu_ps(&x[i + 0]));
|
||||
_mm256_storeu_ps(&x[i + 8], da_r * _mm256_loadu_ps(&x[i + 8]));
|
||||
_mm256_storeu_ps(&x[i + 16], da_r * _mm256_loadu_ps(&x[i + 16]));
|
||||
_mm256_storeu_ps(&x[i + 24], da_r * _mm256_loadu_ps(&x[i + 24]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
/* question to self: Why is this not just memset() */
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512 zero = _mm512_setzero_ps();
|
||||
for (; i < n2; i += 32) {
|
||||
_mm512_storeu_ps(&x[i], zero);
|
||||
_mm512_storeu_ps(&x[i + 16], zero);
|
||||
}
|
||||
#else
|
||||
__m256 zero = _mm256_setzero_ps();
|
||||
for (; i < n2; i += 32) {
|
||||
_mm256_storeu_ps(&x[i + 0], zero);
|
||||
_mm256_storeu_ps(&x[i + 8], zero);
|
||||
_mm256_storeu_ps(&x[i + 16], zero);
|
||||
_mm256_storeu_ps(&x[i + 24], zero);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#include "cscal_microk_haswell-2.c"
|
||||
#endif
|
|
@ -52,18 +52,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
|
|||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
FLOAT ctemp17 /*, ctemp18, ctemp19, ctemp20*/ ;
|
||||
FLOAT /*ctemp21, ctemp22,*/ ctemp23, ctemp24;
|
||||
FLOAT ctemp25 /*, ctemp26, ctemp27, ctemp28*/ ;
|
||||
FLOAT /*ctemp29, ctemp30,*/ ctemp31, ctemp32;
|
||||
FLOAT ctemp33 /*, ctemp34, ctemp35, ctemp36*/ ;
|
||||
FLOAT /*ctemp37, ctemp38,*/ ctemp39, ctemp40;
|
||||
FLOAT ctemp41 /*, ctemp42, ctemp43, ctemp44*/ ;
|
||||
FLOAT /*ctemp45, ctemp46,*/ ctemp47, ctemp48;
|
||||
FLOAT ctemp49 /*, ctemp50, ctemp51, ctemp52*/ ;
|
||||
FLOAT /*ctemp53, ctemp54,*/ ctemp55, ctemp56;
|
||||
FLOAT ctemp57 /*, ctemp58, ctemp59, ctemp60*/ ;
|
||||
FLOAT /*ctemp61, ctemp62,*/ ctemp63, ctemp64;
|
||||
|
||||
|
||||
aoffset = a;
|
||||
|
|
|
@ -142,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
|
||||
}
|
||||
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){
|
||||
float *src, *dst, *dst_tmp, *src_base, *dst_base;
|
||||
float *src, *dst, *dst_tmp=0, *src_base, *dst_base;
|
||||
uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0;
|
||||
BLASLONG cols_left, rows_done; float ALPHA = alpha;
|
||||
if(ALPHA==0.0){
|
||||
|
|
|
@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
|
||||
#pragma GCC optimize("no-tree-vectorize")
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "sgemv_n_microk_bulldozer-4.c"
|
||||
|
|
|
@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
|
||||
#pragma GCC optimize("no-tree-vectorize")
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM)
|
||||
#include "sgemv_t_microk_nehalem-4.c"
|
||||
#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
|
|
|
@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
|
||||
#pragma GCC optimize("no-tree-vectorize")
|
||||
#endif
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "ssymv_L_microk_bulldozer-2.c"
|
||||
#elif defined(NEHALEM)
|
||||
|
|
|
@ -25,9 +25,12 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
|
||||
#pragma GCC optimize("no-tree-vectorize")
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "ssymv_U_microk_bulldozer-2.c"
|
||||
|
|
|
@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
|
||||
#pragma GCC optimize("no-tree-vectorize")
|
||||
#endif
|
||||
|
||||
#if defined(BULLDOZER)
|
||||
#include "zdot_microk_bulldozer-2.c"
|
||||
|
|
|
@ -25,10 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
|
||||
#pragma GCC optimize("no-tree-vectorize")
|
||||
#endif
|
||||
|
||||
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
|
||||
#include "zgemv_n_microk_haswell-4.c"
|
||||
|
@ -231,10 +232,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i,
|
|||
FLOAT xbuffer[8],*ybuffer;
|
||||
|
||||
|
||||
#if 0
|
||||
printf("%s %d %d %.16f %.16f %d %d %d\n","zgemv_n",m,n,alpha_r,alpha_i,lda,inc_x,inc_y);
|
||||
#endif
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
if ( n < 1 ) return(0);
|
||||
|
||||
|
|
|
@ -25,9 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11)
|
||||
#pragma GCC optimize("no-tree-vectorize")
|
||||
#endif
|
||||
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||
#include "zgemv_t_microk_bulldozer-4.c"
|
||||
|
|
|
@ -25,10 +25,25 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
|
||||
* Avoid contraction of floating point operations, specifically fused
|
||||
* multiply-add, because they can cause unexpected results in complex
|
||||
* multiplication.
|
||||
*/
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
#pragma GCC optimize ("fp-contract=off")
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#pragma clang fp contract(off)
|
||||
#endif
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
|
||||
#if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
|
||||
#include "zscal_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "zscal_microk_haswell-2.c"
|
||||
#elif defined(BULLDOZER) || defined(PILEDRIVER)
|
||||
#include "zscal_microk_bulldozer-2.c"
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014-2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
/* _mm512_addsub_pd does not exist so we flip signs for odd elements of da_i */
|
||||
__m512d da_r = _mm512_set1_pd(alpha[0]);
|
||||
__m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1);
|
||||
for (; i < n2; i += 16) {
|
||||
__m512d x0 = _mm512_loadu_pd(&x[i + 0]);
|
||||
__m512d x1 = _mm512_loadu_pd(&x[i + 8]);
|
||||
__m512d y0 = _mm512_permute_pd(x0, 0x55);
|
||||
__m512d y1 = _mm512_permute_pd(x1, 0x55);
|
||||
_mm512_storeu_pd(&x[i + 0], _mm512_add_pd(da_r * x0, da_i * y0));
|
||||
_mm512_storeu_pd(&x[i + 8], _mm512_add_pd(da_r * x1, da_i * y1));
|
||||
}
|
||||
#else
|
||||
__m256d da_r = _mm256_set1_pd(alpha[0]);
|
||||
__m256d da_i = _mm256_set1_pd(alpha[1]);
|
||||
for (; i < n2; i += 16) {
|
||||
__m256d x0 = _mm256_loadu_pd(&x[i + 0]);
|
||||
__m256d x1 = _mm256_loadu_pd(&x[i + 4]);
|
||||
__m256d x2 = _mm256_loadu_pd(&x[i + 8]);
|
||||
__m256d x3 = _mm256_loadu_pd(&x[i + 12]);
|
||||
__m256d y0 = _mm256_permute_pd(x0, 0x05);
|
||||
__m256d y1 = _mm256_permute_pd(x1, 0x05);
|
||||
__m256d y2 = _mm256_permute_pd(x2, 0x05);
|
||||
__m256d y3 = _mm256_permute_pd(x3, 0x05);
|
||||
_mm256_storeu_pd(&x[i + 0], _mm256_addsub_pd(da_r * x0, da_i * y0));
|
||||
_mm256_storeu_pd(&x[i + 4], _mm256_addsub_pd(da_r * x1, da_i * y1));
|
||||
_mm256_storeu_pd(&x[i + 8], _mm256_addsub_pd(da_r * x2, da_i * y2));
|
||||
_mm256_storeu_pd(&x[i + 12], _mm256_addsub_pd(da_r * x3, da_i * y3));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d da_i = _mm512_set1_pd(alpha[1]) * _mm512_set4_pd(1, -1, 1, -1);
|
||||
for (; i < n2; i += 16) {
|
||||
__m512d y0 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 0]), 0x55);
|
||||
__m512d y1 = _mm512_permute_pd(_mm512_loadu_pd(&x[i + 8]), 0x55);
|
||||
_mm512_storeu_pd(&x[i + 0], da_i * y0);
|
||||
_mm512_storeu_pd(&x[i + 8], da_i * y1);
|
||||
}
|
||||
#else
|
||||
__m256d da_i = _mm256_set1_pd(alpha[1]) * _mm256_set_pd(1, -1, 1, -1);
|
||||
for (; i < n2; i += 16) {
|
||||
__m256d y0 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 0]), 0x05);
|
||||
__m256d y1 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 8]), 0x05);
|
||||
__m256d y2 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 16]), 0x05);
|
||||
__m256d y3 = _mm256_permute_pd(_mm256_loadu_pd(&x[i + 24]), 0x05);
|
||||
_mm256_storeu_pd(&x[i + 0], da_i * y0);
|
||||
_mm256_storeu_pd(&x[i + 4], da_i * y1);
|
||||
_mm256_storeu_pd(&x[i + 8], da_i * y2);
|
||||
_mm256_storeu_pd(&x[i + 12], da_i * y3);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d da_r = _mm512_set1_pd(alpha[0]);
|
||||
for (; i < n2; i += 16) {
|
||||
_mm512_storeu_pd(&x[i + 0], da_r * _mm512_loadu_pd(&x[i + 0]));
|
||||
_mm512_storeu_pd(&x[i + 8], da_r * _mm512_loadu_pd(&x[i + 8]));
|
||||
}
|
||||
#else
|
||||
__m256d da_r = _mm256_set1_pd(alpha[0]);
|
||||
for (; i < n2; i += 16) {
|
||||
_mm256_storeu_pd(&x[i + 0], da_r * _mm256_loadu_pd(&x[i + 0]));
|
||||
_mm256_storeu_pd(&x[i + 4], da_r * _mm256_loadu_pd(&x[i + 4]));
|
||||
_mm256_storeu_pd(&x[i + 8], da_r * _mm256_loadu_pd(&x[i + 8]));
|
||||
_mm256_storeu_pd(&x[i + 12], da_r * _mm256_loadu_pd(&x[i + 12]));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
BLASLONG n2 = n + n;
|
||||
|
||||
/* question to self: Why is this not just memset() */
|
||||
|
||||
#ifdef __AVX512CD__
|
||||
__m512d zero = _mm512_setzero_pd();
|
||||
for (; i < n2; i += 16) {
|
||||
_mm512_storeu_pd(&x[i], zero);
|
||||
_mm512_storeu_pd(&x[i + 8], zero);
|
||||
}
|
||||
#else
|
||||
__m256d zero = _mm256_setzero_pd();
|
||||
for (; i < n2; i += 16) {
|
||||
_mm256_storeu_pd(&x[i + 0], zero);
|
||||
_mm256_storeu_pd(&x[i + 4], zero);
|
||||
_mm256_storeu_pd(&x[i + 8], zero);
|
||||
_mm256_storeu_pd(&x[i + 12], zero);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#include "zscal_microk_haswell-2.c"
|
||||
#endif
|
|
@ -573,7 +573,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
|
|||
|
||||
/* ===================================================================== */
|
||||
*vers_major__ = 3;
|
||||
*vers_minor__ = 9;
|
||||
*vers_minor__ = 11;
|
||||
*vers_patch__ = 0;
|
||||
/* ===================================================================== */
|
||||
|
||||
|
|
|
@ -60,7 +60,7 @@
|
|||
INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH
|
||||
* =====================================================================
|
||||
VERS_MAJOR = 3
|
||||
VERS_MINOR = 9
|
||||
VERS_MINOR = 11
|
||||
VERS_PATCH = 0
|
||||
* =====================================================================
|
||||
*
|
||||
|
|
|
@ -49,11 +49,9 @@
|
|||
|
||||
LAPACKE_dgels (row-major, high-level) Example Program Results
|
||||
|
||||
-- LAPACKE Example routine (version 3.7.0) --
|
||||
-- LAPACKE Example routine --
|
||||
-- LAPACK is a software package provided by Univ. of Tennessee, --
|
||||
-- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
||||
December 2016
|
||||
|
||||
*/
|
||||
/* Calling DGELS using row-major layout */
|
||||
|
||||
|
@ -66,8 +64,8 @@
|
|||
int main (int argc, const char * argv[])
|
||||
{
|
||||
/* Locals */
|
||||
double A[5][3] = {1,1,1,2,3,4,3,5,2,4,2,5,5,4,3};
|
||||
double b[5][2] = {-10,-3,12,14,14,12,16,16,18,16};
|
||||
double A[5][3] = {{1,1,1},{2,3,4},{3,5,2},{4,2,5},{5,4,3}};
|
||||
double b[5][2] = {{-10,-3},{12,14},{14,12},{16,16},{18,16}};
|
||||
lapack_int info,m,n,lda,ldb,nrhs;
|
||||
|
||||
/* Initialization */
|
||||
|
|
|
@ -25,11 +25,9 @@
|
|||
|
||||
LAPACKE_dgesv (col-major, high-level) Example Program Results
|
||||
|
||||
-- LAPACKE Example routine (version 3.7.0) --
|
||||
-- LAPACKE Example routine --
|
||||
-- LAPACK is a software package provided by Univ. of Tennessee, --
|
||||
-- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
||||
December 2016
|
||||
|
||||
*/
|
||||
/* Includes */
|
||||
#include <stdlib.h>
|
||||
|
@ -94,7 +92,7 @@ int main(int argc, char **argv) {
|
|||
/* Check for the exact singularity */
|
||||
if( info > 0 ) {
|
||||
printf( "The diagonal element of the triangular factor of A,\n" );
|
||||
printf( "U(%i,%i) is zero, so that A is singular;\n", info, info );
|
||||
printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info );
|
||||
printf( "the solution could not be computed.\n" );
|
||||
exit( 1 );
|
||||
}
|
||||
|
|
|
@ -25,11 +25,9 @@
|
|||
|
||||
LAPACKE_dgesv (row-major, high-level) Example Program Results
|
||||
|
||||
-- LAPACKE Example routine (version 3.7.0) --
|
||||
-- LAPACKE Example routine --
|
||||
-- LAPACK is a software package provided by Univ. of Tennessee, --
|
||||
-- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
||||
December 2016
|
||||
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
@ -91,7 +89,7 @@ int main(int argc, char **argv) {
|
|||
/* Check for the exact singularity */
|
||||
if( info > 0 ) {
|
||||
printf( "The diagonal element of the triangular factor of A,\n" );
|
||||
printf( "U(%i,%i) is zero, so that A is singular;\n", info, info );
|
||||
printf( "U(%" LAPACK_IFMT ",%" LAPACK_IFMT ") is zero, so that A is singular;\n", info, info );
|
||||
printf( "the solution could not be computed.\n" );
|
||||
exit( 1 );
|
||||
}
|
||||
|
|
|
@ -28,6 +28,6 @@ void print_matrix_colmajor( char* desc, lapack_int m, lapack_int n, double* mat,
|
|||
void print_vector( char* desc, lapack_int n, lapack_int* vec ) {
|
||||
lapack_int j;
|
||||
printf( "\n %s\n", desc );
|
||||
for( j = 0; j < n; j++ ) printf( " %6i", vec[j] );
|
||||
for( j = 0; j < n; j++ ) printf( " %6" LAPACK_IFMT, vec[j] );
|
||||
printf( "\n" );
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
/* It seems all current Fortran compilers put strlen at end.
|
||||
* Some historical compilers put strlen after the str argument
|
||||
|
@ -80,7 +81,22 @@ extern "C" {
|
|||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
#ifndef lapack_int
|
||||
#define lapack_int int
|
||||
#if defined(LAPACK_ILP64)
|
||||
#define lapack_int int64_t
|
||||
#else
|
||||
#define lapack_int int32_t
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Integer format string
|
||||
*/
|
||||
#ifndef LAPACK_IFMT
|
||||
#if defined(LAPACK_ILP64)
|
||||
#define LAPACK_IFMT PRId64
|
||||
#else
|
||||
#define LAPACK_IFMT PRId32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef lapack_logical
|
||||
|
@ -21986,6 +22002,84 @@ void LAPACK_ztrsyl_base(
|
|||
#define LAPACK_ztrsyl(...) LAPACK_ztrsyl_base(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define LAPACK_ctrsyl3_base LAPACK_GLOBAL(ctrsyl3,CTRSYL3)
|
||||
void LAPACK_ctrsyl3_base(
|
||||
char const* trana, char const* tranb,
|
||||
lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
|
||||
lapack_complex_float const* A, lapack_int const* lda,
|
||||
lapack_complex_float const* B, lapack_int const* ldb,
|
||||
lapack_complex_float* C, lapack_int const* ldc, float* scale,
|
||||
float* swork, lapack_int const *ldswork,
|
||||
lapack_int* info
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
, size_t, size_t
|
||||
#endif
|
||||
);
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
#define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__, 1, 1)
|
||||
#else
|
||||
#define LAPACK_ctrsyl3(...) LAPACK_ctrsyl3_base(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define LAPACK_dtrsyl3_base LAPACK_GLOBAL(dtrsyl3,DTRSYL3)
|
||||
void LAPACK_dtrsyl3_base(
|
||||
char const* trana, char const* tranb,
|
||||
lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
|
||||
double const* A, lapack_int const* lda,
|
||||
double const* B, lapack_int const* ldb,
|
||||
double* C, lapack_int const* ldc, double* scale,
|
||||
lapack_int* iwork, lapack_int const* liwork,
|
||||
double* swork, lapack_int const *ldswork,
|
||||
lapack_int* info
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
, size_t, size_t
|
||||
#endif
|
||||
);
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
#define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__, 1, 1)
|
||||
#else
|
||||
#define LAPACK_dtrsyl3(...) LAPACK_dtrsyl3_base(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define LAPACK_strsyl3_base LAPACK_GLOBAL(strsyl3,STRSYL3)
|
||||
void LAPACK_strsyl3_base(
|
||||
char const* trana, char const* tranb,
|
||||
lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
|
||||
float const* A, lapack_int const* lda,
|
||||
float const* B, lapack_int const* ldb,
|
||||
float* C, lapack_int const* ldc, float* scale,
|
||||
lapack_int* iwork, lapack_int const* liwork,
|
||||
float* swork, lapack_int const *ldswork,
|
||||
lapack_int* info
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
, size_t, size_t
|
||||
#endif
|
||||
);
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
#define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__, 1, 1)
|
||||
#else
|
||||
#define LAPACK_strsyl3(...) LAPACK_strsyl3_base(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define LAPACK_ztrsyl3_base LAPACK_GLOBAL(ztrsyl3,ZTRSYL3)
|
||||
void LAPACK_ztrsyl3_base(
|
||||
char const* trana, char const* tranb,
|
||||
lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
|
||||
lapack_complex_double const* A, lapack_int const* lda,
|
||||
lapack_complex_double const* B, lapack_int const* ldb,
|
||||
lapack_complex_double* C, lapack_int const* ldc, double* scale,
|
||||
double* swork, lapack_int const *ldswork,
|
||||
lapack_int* info
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
, size_t, size_t
|
||||
#endif
|
||||
);
|
||||
#ifdef LAPACK_FORTRAN_STRLEN_END
|
||||
#define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__, 1, 1)
|
||||
#else
|
||||
#define LAPACK_ztrsyl3(...) LAPACK_ztrsyl3_base(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
#define LAPACK_ctrtri_base LAPACK_GLOBAL(ctrtri,CTRTRI)
|
||||
void LAPACK_ctrtri_base(
|
||||
char const* uplo, char const* diag,
|
||||
|
|
|
@ -2313,6 +2313,19 @@ lapack_int LAPACKE_zlagge( int matrix_layout, lapack_int m, lapack_int n,
|
|||
float LAPACKE_slamch( char cmach );
|
||||
double LAPACKE_dlamch( char cmach );
|
||||
|
||||
float LAPACKE_slangb( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku, const float* ab,
|
||||
lapack_int ldab );
|
||||
double LAPACKE_dlangb( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku, const double* ab,
|
||||
lapack_int ldab );
|
||||
float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
const lapack_complex_float* ab, lapack_int ldab );
|
||||
double LAPACKE_zlangb( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
const lapack_complex_double* ab, lapack_int ldab );
|
||||
|
||||
float LAPACKE_slange( int matrix_layout, char norm, lapack_int m,
|
||||
lapack_int n, const float* a, lapack_int lda );
|
||||
double LAPACKE_dlange( int matrix_layout, char norm, lapack_int m,
|
||||
|
@ -4477,6 +4490,23 @@ lapack_int LAPACKE_ztrsyl( int matrix_layout, char trana, char tranb,
|
|||
lapack_complex_double* c, lapack_int ldc,
|
||||
double* scale );
|
||||
|
||||
lapack_int LAPACKE_strsyl3( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const float* a, lapack_int lda, const float* b,
|
||||
lapack_int ldb, float* c, lapack_int ldc,
|
||||
float* scale );
|
||||
lapack_int LAPACKE_dtrsyl3( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const double* a, lapack_int lda, const double* b,
|
||||
lapack_int ldb, double* c, lapack_int ldc,
|
||||
double* scale );
|
||||
lapack_int LAPACKE_ztrsyl3( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const lapack_complex_double* a, lapack_int lda,
|
||||
const lapack_complex_double* b, lapack_int ldb,
|
||||
lapack_complex_double* c, lapack_int ldc,
|
||||
double* scale );
|
||||
|
||||
lapack_int LAPACKE_strtri( int matrix_layout, char uplo, char diag, lapack_int n,
|
||||
float* a, lapack_int lda );
|
||||
lapack_int LAPACKE_dtrtri( int matrix_layout, char uplo, char diag, lapack_int n,
|
||||
|
@ -7576,6 +7606,21 @@ double LAPACKE_dlapy3_work( double x, double y, double z );
|
|||
float LAPACKE_slamch_work( char cmach );
|
||||
double LAPACKE_dlamch_work( char cmach );
|
||||
|
||||
float LAPACKE_slangb_work( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku, const float* ab,
|
||||
lapack_int ldab, float* work );
|
||||
double LAPACKE_dlangb_work( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku, const double* ab,
|
||||
lapack_int ldab, double* work );
|
||||
float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
const lapack_complex_float* ab, lapack_int ldab,
|
||||
float* work );
|
||||
double LAPACKE_zlangb_work( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
const lapack_complex_double* ab, lapack_int ldab,
|
||||
double* work );
|
||||
|
||||
float LAPACKE_slange_work( int matrix_layout, char norm, lapack_int m,
|
||||
lapack_int n, const float* a, lapack_int lda,
|
||||
float* work );
|
||||
|
@ -10174,6 +10219,35 @@ lapack_int LAPACKE_ztrsyl_work( int matrix_layout, char trana, char tranb,
|
|||
lapack_complex_double* c, lapack_int ldc,
|
||||
double* scale );
|
||||
|
||||
lapack_int LAPACKE_strsyl3_work( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const float* a, lapack_int lda,
|
||||
const float* b, lapack_int ldb,
|
||||
float* c, lapack_int ldc, float* scale,
|
||||
lapack_int* iwork, lapack_int liwork,
|
||||
float* swork, lapack_int ldswork );
|
||||
lapack_int LAPACKE_dtrsyl3_work( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const double* a, lapack_int lda,
|
||||
const double* b, lapack_int ldb,
|
||||
double* c, lapack_int ldc, double* scale,
|
||||
lapack_int* iwork, lapack_int liwork,
|
||||
double* swork, lapack_int ldswork );
|
||||
lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const lapack_complex_float* a, lapack_int lda,
|
||||
const lapack_complex_float* b, lapack_int ldb,
|
||||
lapack_complex_float* c, lapack_int ldc,
|
||||
float* scale, float* swork,
|
||||
lapack_int ldswork );
|
||||
lapack_int LAPACKE_ztrsyl3_work( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const lapack_complex_double* a, lapack_int lda,
|
||||
const lapack_complex_double* b, lapack_int ldb,
|
||||
lapack_complex_double* c, lapack_int ldc,
|
||||
double* scale, double* swork,
|
||||
lapack_int ldswork );
|
||||
|
||||
lapack_int LAPACKE_strtri_work( int matrix_layout, char uplo, char diag,
|
||||
lapack_int n, float* a, lapack_int lda );
|
||||
lapack_int LAPACKE_dtrtri_work( int matrix_layout, char uplo, char diag,
|
||||
|
|
|
@ -42,6 +42,7 @@ extern "C" {
|
|||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#ifndef lapack_int
|
||||
#if defined(LAPACK_ILP64)
|
||||
|
@ -51,6 +52,17 @@ extern "C" {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Integer format string
|
||||
*/
|
||||
#ifndef LAPACK_IFMT
|
||||
#if defined(LAPACK_ILP64)
|
||||
#define LAPACK_IFMT PRId64
|
||||
#else
|
||||
#define LAPACK_IFMT PRId32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef lapack_logical
|
||||
#define lapack_logical lapack_int
|
||||
#endif
|
||||
|
|
|
@ -128,6 +128,10 @@ void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag,
|
|||
void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
|
||||
const lapack_complex_float *in, lapack_int ldin,
|
||||
lapack_complex_float *out, lapack_int ldout );
|
||||
void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const lapack_complex_float *in, lapack_int ldin,
|
||||
lapack_complex_float *out, lapack_int ldout );
|
||||
|
||||
void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
|
@ -178,6 +182,10 @@ void LAPACKE_dtp_trans( int matrix_layout, char uplo, char diag,
|
|||
void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
|
||||
const double *in, lapack_int ldin,
|
||||
double *out, lapack_int ldout );
|
||||
void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const double *in, lapack_int ldin,
|
||||
double *out, lapack_int ldout );
|
||||
|
||||
void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
|
@ -228,6 +236,10 @@ void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag,
|
|||
void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n,
|
||||
const float *in, lapack_int ldin,
|
||||
float *out, lapack_int ldout );
|
||||
void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const float *in, lapack_int ldin,
|
||||
float *out, lapack_int ldout );
|
||||
|
||||
void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
|
@ -284,6 +296,10 @@ void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag,
|
|||
void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
|
||||
const lapack_complex_double *in, lapack_int ldin,
|
||||
lapack_complex_double *out, lapack_int ldout );
|
||||
void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const lapack_complex_double *in, lapack_int ldin,
|
||||
lapack_complex_double *out, lapack_int ldout );
|
||||
|
||||
/* NaN checkers */
|
||||
#define LAPACK_SISNAN( x ) ( x != x )
|
||||
|
@ -376,6 +392,10 @@ lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag,
|
|||
lapack_int n,
|
||||
const lapack_complex_float *a,
|
||||
lapack_int lda );
|
||||
lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const lapack_complex_float *a,
|
||||
lapack_int lda );
|
||||
|
||||
lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m,
|
||||
lapack_int n, lapack_int kl,
|
||||
|
@ -440,6 +460,9 @@ lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag,
|
|||
lapack_int n,
|
||||
const double *a,
|
||||
lapack_int lda );
|
||||
lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const double *a, lapack_int lda );
|
||||
|
||||
lapack_logical LAPACKE_sgb_nancheck( int matrix_layout, lapack_int m,
|
||||
lapack_int n, lapack_int kl,
|
||||
|
@ -504,6 +527,9 @@ lapack_logical LAPACKE_str_nancheck( int matrix_layout, char uplo, char diag,
|
|||
lapack_int n,
|
||||
const float *a,
|
||||
lapack_int lda );
|
||||
lapack_logical LAPACKE_stz_nancheck( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const float *a, lapack_int lda );
|
||||
|
||||
lapack_logical LAPACKE_zgb_nancheck( int matrix_layout, lapack_int m,
|
||||
lapack_int n, lapack_int kl,
|
||||
|
@ -574,6 +600,10 @@ lapack_logical LAPACKE_ztr_nancheck( int matrix_layout, char uplo, char diag,
|
|||
lapack_int n,
|
||||
const lapack_complex_double *a,
|
||||
lapack_int lda );
|
||||
lapack_logical LAPACKE_ztz_nancheck( int matrix_layout, char direct, char uplo,
|
||||
char diag, lapack_int m, lapack_int n,
|
||||
const lapack_complex_double *a,
|
||||
lapack_int lda );
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -358,6 +358,8 @@ lapacke_clacrm.o \
|
|||
lapacke_clacrm_work.o \
|
||||
lapacke_clag2z.o \
|
||||
lapacke_clag2z_work.o \
|
||||
lapacke_clangb.o \
|
||||
lapacke_clangb_work.o \
|
||||
lapacke_clange.o \
|
||||
lapacke_clange_work.o \
|
||||
lapacke_clanhe.o \
|
||||
|
@ -842,6 +844,8 @@ lapacke_dlag2s.o \
|
|||
lapacke_dlag2s_work.o \
|
||||
lapacke_dlamch.o \
|
||||
lapacke_dlamch_work.o \
|
||||
lapacke_dlangb.o \
|
||||
lapacke_dlangb_work.o \
|
||||
lapacke_dlange.o \
|
||||
lapacke_dlange_work.o \
|
||||
lapacke_dlansy.o \
|
||||
|
@ -1414,6 +1418,8 @@ lapacke_slacpy.o \
|
|||
lapacke_slacpy_work.o \
|
||||
lapacke_slamch.o \
|
||||
lapacke_slamch_work.o \
|
||||
lapacke_slangb.o \
|
||||
lapacke_slangb_work.o \
|
||||
lapacke_slange.o \
|
||||
lapacke_slange_work.o \
|
||||
lapacke_slansy.o \
|
||||
|
@ -2116,6 +2122,8 @@ lapacke_zlacrm.o \
|
|||
lapacke_zlacrm_work.o \
|
||||
lapacke_zlag2c.o \
|
||||
lapacke_zlag2c_work.o \
|
||||
lapacke_zlangb.o \
|
||||
lapacke_zlangb_work.o \
|
||||
lapacke_zlange.o \
|
||||
lapacke_zlange_work.o \
|
||||
lapacke_zlanhe.o \
|
||||
|
|
|
@ -61,12 +61,12 @@ lapack_int LAPACKE_cgeev_work( int matrix_layout, char jobvl, char jobvr,
|
|||
LAPACKE_xerbla( "LAPACKE_cgeev_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvl < n ) {
|
||||
if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) {
|
||||
info = -9;
|
||||
LAPACKE_xerbla( "LAPACKE_cgeev_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvr < n ) {
|
||||
if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) {
|
||||
info = -11;
|
||||
LAPACKE_xerbla( "LAPACKE_cgeev_work", info );
|
||||
return info;
|
||||
|
|
|
@ -65,12 +65,12 @@ lapack_int LAPACKE_cgeevx_work( int matrix_layout, char balanc, char jobvl,
|
|||
LAPACKE_xerbla( "LAPACKE_cgeevx_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvl < n ) {
|
||||
if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) {
|
||||
info = -11;
|
||||
LAPACKE_xerbla( "LAPACKE_cgeevx_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvr < n ) {
|
||||
if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) {
|
||||
info = -13;
|
||||
LAPACKE_xerbla( "LAPACKE_cgeevx_work", info );
|
||||
return info;
|
||||
|
|
|
@ -48,7 +48,6 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp,
|
|||
lapack_int lrwork = -1;
|
||||
float* rwork = NULL;
|
||||
float rwork_query;
|
||||
lapack_int i;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 );
|
||||
return -1;
|
||||
|
|
|
@ -72,12 +72,12 @@ lapack_int LAPACKE_cgges_work( int matrix_layout, char jobvsl, char jobvsr,
|
|||
LAPACKE_xerbla( "LAPACKE_cgges_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvsl < n ) {
|
||||
if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) {
|
||||
info = -15;
|
||||
LAPACKE_xerbla( "LAPACKE_cgges_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvsr < n ) {
|
||||
if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) {
|
||||
info = -17;
|
||||
LAPACKE_xerbla( "LAPACKE_cgges_work", info );
|
||||
return info;
|
||||
|
|
|
@ -76,12 +76,12 @@ lapack_int LAPACKE_cggesx_work( int matrix_layout, char jobvsl, char jobvsr,
|
|||
LAPACKE_xerbla( "LAPACKE_cggesx_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvsl < n ) {
|
||||
if( ldvsl < 1 || ( LAPACKE_lsame( jobvsl, 'v' ) && ldvsl < n ) ) {
|
||||
info = -16;
|
||||
LAPACKE_xerbla( "LAPACKE_cggesx_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvsr < n ) {
|
||||
if( ldvsr < 1 || ( LAPACKE_lsame( jobvsr, 'v' ) && ldvsr < n ) ) {
|
||||
info = -18;
|
||||
LAPACKE_xerbla( "LAPACKE_cggesx_work", info );
|
||||
return info;
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2022, Intel Corp.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************
|
||||
* Contents: Native high-level C interface to LAPACK function clangb
|
||||
* Author: Simon Märtens
|
||||
*****************************************************************************/
|
||||
|
||||
#include "lapacke_utils.h"
|
||||
|
||||
float LAPACKE_clangb( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
const lapack_complex_float* ab, lapack_int ldab )
|
||||
{
|
||||
lapack_int info = 0;
|
||||
float res = 0.;
|
||||
float* work = NULL;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clangb", -1 );
|
||||
return -1;
|
||||
}
|
||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
if( LAPACKE_cgb_nancheck( matrix_layout, n, n, kl, ku, ab, ldab ) ) {
|
||||
return -6;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* Allocate memory for working array(s) */
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
work = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) );
|
||||
if( work == NULL ) {
|
||||
info = LAPACK_WORK_MEMORY_ERROR;
|
||||
goto exit_level_0;
|
||||
}
|
||||
}
|
||||
/* Call middle-level interface */
|
||||
res = LAPACKE_clangb_work( matrix_layout, norm, n, kl, ku, ab, ldab, work );
|
||||
/* Release memory and exit */
|
||||
if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
LAPACKE_free( work );
|
||||
}
|
||||
exit_level_0:
|
||||
if( info == LAPACK_WORK_MEMORY_ERROR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clangb", info );
|
||||
}
|
||||
return res;
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2022, Intel Corp.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of Intel Corporation nor the names of its contributors
|
||||
may be used to endorse or promote products derived from this software
|
||||
without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
|
||||
THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************
|
||||
* Contents: Native middle-level C interface to LAPACK function clangb
|
||||
* Author: Simon Märtens
|
||||
*****************************************************************************/
|
||||
|
||||
#include "lapacke_utils.h"
|
||||
|
||||
float LAPACKE_clangb_work( int matrix_layout, char norm, lapack_int n,
|
||||
lapack_int kl, lapack_int ku,
|
||||
const lapack_complex_float* ab, lapack_int ldab,
|
||||
float* work )
|
||||
{
|
||||
lapack_int info = 0;
|
||||
float res = 0.;
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
/* Call LAPACK function and adjust info */
|
||||
res = LAPACK_clangb( &norm, &n, &kl, &ku, ab, &ldab, work );
|
||||
} else if( matrix_layout == LAPACK_ROW_MAJOR ) {
|
||||
char norm_lapack;
|
||||
float* work_lapack = NULL;
|
||||
/* Check leading dimension(s) */
|
||||
if( ldab < kl+ku+1 ) {
|
||||
info = -7;
|
||||
LAPACKE_xerbla( "LAPACKE_clangb_work", info );
|
||||
return info;
|
||||
}
|
||||
if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) {
|
||||
norm_lapack = 'i';
|
||||
} else if( LAPACKE_lsame( norm, 'i' ) ) {
|
||||
norm_lapack = '1';
|
||||
} else {
|
||||
norm_lapack = norm;
|
||||
}
|
||||
/* Allocate memory for work array(s) */
|
||||
if( LAPACKE_lsame( norm_lapack, 'i' ) ) {
|
||||
work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) );
|
||||
if( work_lapack == NULL ) {
|
||||
info = LAPACK_WORK_MEMORY_ERROR;
|
||||
goto exit_level_0;
|
||||
}
|
||||
}
|
||||
/* Call LAPACK function */
|
||||
res = LAPACK_clangb( &norm, &n, &ku, &kl, ab, &ldab, work );
|
||||
/* Release memory and exit */
|
||||
if( work_lapack ) {
|
||||
LAPACKE_free( work_lapack );
|
||||
}
|
||||
exit_level_0:
|
||||
if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clangb_work", info );
|
||||
}
|
||||
} else {
|
||||
info = -1;
|
||||
LAPACKE_xerbla( "LAPACKE_clangb_work", info );
|
||||
}
|
||||
return res;
|
||||
}
|
|
@ -46,7 +46,7 @@ float LAPACKE_clantr( int matrix_layout, char norm, char uplo, char diag,
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, uplo, diag, MIN(m,n), a, lda ) ) {
|
||||
if( LAPACKE_ctz_nancheck( matrix_layout, 'f', uplo, diag, m, n, a, lda ) ) {
|
||||
return -7;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,7 +42,9 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
|||
lapack_int info = 0;
|
||||
lapack_int ldwork;
|
||||
lapack_complex_float* work = NULL;
|
||||
lapack_int ncols_v, nrows_v;
|
||||
lapack_int nrows_v, ncols_v;
|
||||
lapack_logical left, col, forward;
|
||||
char uplo;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb", -1 );
|
||||
return -1;
|
||||
|
@ -50,59 +52,27 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
|
|||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
lapack_int lrv, lcv; /* row, column stride */
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
lrv = 1;
|
||||
lcv = ldv;
|
||||
} else {
|
||||
lrv = ldv;
|
||||
lcv = 1;
|
||||
}
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
left = LAPACKE_lsame( side, 'l' );
|
||||
col = LAPACKE_lsame( storev, 'c' );
|
||||
forward = LAPACKE_lsame( direct, 'f' );
|
||||
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -13;
|
||||
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
|
||||
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
|
||||
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
|
||||
|
||||
if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
|
||||
return -8;
|
||||
}
|
||||
if( LAPACKE_ctz_nancheck( matrix_layout, direct, uplo, 'u',
|
||||
nrows_v, ncols_v, v, ldv ) ) {
|
||||
return -9;
|
||||
}
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, k, k, t, ldt ) ) {
|
||||
return -11;
|
||||
}
|
||||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > nrows_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
|
||||
return -8;
|
||||
}
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k,
|
||||
&v[(nrows_v-k)*lrv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v-k, ncols_v, v, ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'u', 'u', k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k,
|
||||
&v[k*lrv], ldv ) )
|
||||
return -9;
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) && LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > ncols_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
|
||||
return -8;
|
||||
}
|
||||
if( LAPACKE_ctr_nancheck( matrix_layout, 'l', 'u', k,
|
||||
&v[(ncols_v-k)*lcv], ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, nrows_v, ncols_v-k, v, ldv ) )
|
||||
return -9;
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -13;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -42,6 +42,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
|
|||
{
|
||||
lapack_int info = 0;
|
||||
lapack_int nrows_v, ncols_v;
|
||||
lapack_logical left, col, forward;
|
||||
char uplo;
|
||||
lapack_int ldc_t, ldt_t, ldv_t;
|
||||
lapack_complex_float *v_t = NULL, *t_t = NULL, *c_t = NULL;
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
|
@ -52,16 +54,14 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
|
|||
info = info - 1;
|
||||
}
|
||||
} else if( matrix_layout == LAPACK_ROW_MAJOR ) {
|
||||
nrows_v = ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n :
|
||||
( LAPACKE_lsame( storev, 'r' ) ? k : 1) );
|
||||
ncols_v = LAPACKE_lsame( storev, 'c' ) ? k :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'l' ) ) ? m :
|
||||
( ( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( side, 'r' ) ) ? n : 1) );
|
||||
left = LAPACKE_lsame( side, 'l' );
|
||||
col = LAPACKE_lsame( storev, 'c' );
|
||||
forward = LAPACKE_lsame( direct, 'f' );
|
||||
|
||||
nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
|
||||
ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
|
||||
uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
|
||||
|
||||
ldc_t = MAX(1,m);
|
||||
ldt_t = MAX(1,k);
|
||||
ldv_t = MAX(1,nrows_v);
|
||||
|
@ -81,6 +81,11 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
|
|||
LAPACKE_xerbla( "LAPACKE_clarfb_work", info );
|
||||
return info;
|
||||
}
|
||||
if( !forward && ( col && k > nrows_v ) || ( !col && k > ncols_v )) {
|
||||
info = -8;
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb_work", info );
|
||||
return info;
|
||||
}
|
||||
/* Allocate memory for temporary array(s) */
|
||||
v_t = (lapack_complex_float*)
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) *
|
||||
|
@ -102,36 +107,8 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
|
|||
goto exit_level_2;
|
||||
}
|
||||
/* Transpose input matrices */
|
||||
if( LAPACKE_lsame( storev, 'c' ) && LAPACKE_lsame( direct, 'f' ) ) {
|
||||
LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, v, ldv, v_t, ldv_t );
|
||||
LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, &v[k*ldv], ldv,
|
||||
&v_t[k], ldv_t );
|
||||
} else if( LAPACKE_lsame( storev, 'c' ) &&
|
||||
LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > nrows_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 );
|
||||
return -8;
|
||||
}
|
||||
LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, &v[(nrows_v-k)*ldv],
|
||||
ldv, &v_t[nrows_v-k], ldv_t );
|
||||
LAPACKE_cge_trans( matrix_layout, nrows_v-k, ncols_v, v, ldv, v_t,
|
||||
ldv_t );
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( direct, 'f' ) ) {
|
||||
LAPACKE_ctr_trans( matrix_layout, 'u', 'u', k, v, ldv, v_t, ldv_t );
|
||||
LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, &v[k], ldv,
|
||||
&v_t[k*ldv_t], ldv_t );
|
||||
} else if( LAPACKE_lsame( storev, 'r' ) &&
|
||||
LAPACKE_lsame( direct, 'b' ) ) {
|
||||
if( k > ncols_v ) {
|
||||
LAPACKE_xerbla( "LAPACKE_clarfb_work", -8 );
|
||||
return -8;
|
||||
}
|
||||
LAPACKE_ctr_trans( matrix_layout, 'l', 'u', k, &v[ncols_v-k], ldv,
|
||||
&v_t[(ncols_v-k)*ldv_t], ldv_t );
|
||||
LAPACKE_cge_trans( matrix_layout, nrows_v, ncols_v-k, v, ldv, v_t,
|
||||
ldv_t );
|
||||
}
|
||||
LAPACKE_ctz_trans( matrix_layout, direct, uplo, 'u', nrows_v, ncols_v,
|
||||
v, ldv, v_t, ldv_t );
|
||||
LAPACKE_cge_trans( matrix_layout, k, k, t, ldt, t_t, ldt_t );
|
||||
LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t );
|
||||
/* Call LAPACK function and adjust info */
|
||||
|
|
|
@ -50,16 +50,24 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans,
|
|||
info = info - 1;
|
||||
}
|
||||
} else if( matrix_layout == LAPACK_ROW_MAJOR ) {
|
||||
lapack_int lda_t = MAX(1,k);
|
||||
lapack_int nrowsA, ncolsA, nrowsV;
|
||||
if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; }
|
||||
else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; }
|
||||
else {
|
||||
info = -2;
|
||||
LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info );
|
||||
return info;
|
||||
}
|
||||
lapack_int lda_t = MAX(1,nrowsA);
|
||||
lapack_int ldb_t = MAX(1,m);
|
||||
lapack_int ldt_t = MAX(1,ldt);
|
||||
lapack_int ldv_t = MAX(1,ldv);
|
||||
lapack_int ldt_t = MAX(1,nb);
|
||||
lapack_int ldv_t = MAX(1,nrowsV);
|
||||
lapack_complex_float* v_t = NULL;
|
||||
lapack_complex_float* t_t = NULL;
|
||||
lapack_complex_float* a_t = NULL;
|
||||
lapack_complex_float* b_t = NULL;
|
||||
/* Check leading dimension(s) */
|
||||
if( lda < m ) {
|
||||
if( lda < ncolsA ) {
|
||||
info = -14;
|
||||
LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info );
|
||||
return info;
|
||||
|
@ -69,7 +77,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans,
|
|||
LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldt < nb ) {
|
||||
if( ldt < k ) {
|
||||
info = -12;
|
||||
LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info );
|
||||
return info;
|
||||
|
@ -87,13 +95,13 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans,
|
|||
goto exit_level_0;
|
||||
}
|
||||
t_t = (lapack_complex_float*)
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,nb) );
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,k) );
|
||||
if( t_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_1;
|
||||
}
|
||||
a_t = (lapack_complex_float*)
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) );
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,ncolsA) );
|
||||
if( a_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_2;
|
||||
|
@ -105,10 +113,10 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans,
|
|||
goto exit_level_3;
|
||||
}
|
||||
/* Transpose input matrices */
|
||||
LAPACKE_cge_trans( matrix_layout, ldv, k, v, ldv, v_t, ldv_t );
|
||||
LAPACKE_cge_trans( matrix_layout, ldt, nb, t, ldt, t_t, ldt_t );
|
||||
LAPACKE_cge_trans( matrix_layout, k, m, a, lda, a_t, lda_t );
|
||||
LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
|
||||
LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsV, k, v, ldv, v_t, ldv_t );
|
||||
LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nb, k, t, ldt, t_t, ldt_t );
|
||||
LAPACKE_cge_trans( LAPACK_ROW_MAJOR, nrowsA, ncolsA, a, lda, a_t, lda_t );
|
||||
LAPACKE_cge_trans( LAPACK_ROW_MAJOR, m, n, b, ldb, b_t, ldb_t );
|
||||
/* Call LAPACK function and adjust info */
|
||||
LAPACK_ctpmqrt( &side, &trans, &m, &n, &k, &l, &nb, v_t, &ldv_t, t_t,
|
||||
&ldt_t, a_t, &lda_t, b_t, &ldb_t, work, &info );
|
||||
|
@ -116,7 +124,7 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans,
|
|||
info = info - 1;
|
||||
}
|
||||
/* Transpose output matrices */
|
||||
LAPACKE_cge_trans( LAPACK_COL_MAJOR, k, m, a_t, lda_t, a, lda );
|
||||
LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrowsA, ncolsA, a_t, lda_t, a, lda );
|
||||
LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
|
||||
/* Release memory and exit */
|
||||
LAPACKE_free( b_t );
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
#include "lapacke_utils.h"
|
||||
|
||||
lapack_int LAPACKE_ctrsyl3( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const lapack_complex_float* a, lapack_int lda,
|
||||
const lapack_complex_float* b, lapack_int ldb,
|
||||
lapack_complex_float* c, lapack_int ldc,
|
||||
float* scale )
|
||||
{
|
||||
lapack_int info = 0;
|
||||
float swork_query[2];
|
||||
float* swork = NULL;
|
||||
lapack_int ldswork = -1;
|
||||
lapack_int swork_size = -1;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_ctrsyl3", -1 );
|
||||
return -1;
|
||||
}
|
||||
#ifndef LAPACK_DISABLE_NAN_CHECK
|
||||
if( LAPACKE_get_nancheck() ) {
|
||||
/* Optionally check input matrices for NaNs */
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, m, m, a, lda ) ) {
|
||||
return -7;
|
||||
}
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) {
|
||||
return -9;
|
||||
}
|
||||
if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
|
||||
return -11;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* Query optimal working array sizes */
|
||||
info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a, lda,
|
||||
b, ldb, c, ldc, scale, swork_query, ldswork );
|
||||
if( info != 0 ) {
|
||||
goto exit_level_0;
|
||||
}
|
||||
ldswork = swork_query[0];
|
||||
swork_size = ldswork * swork_query[1];
|
||||
swork = (float*)LAPACKE_malloc( sizeof(float) * swork_size);
|
||||
if( swork == NULL ) {
|
||||
info = LAPACK_WORK_MEMORY_ERROR;
|
||||
goto exit_level_0;
|
||||
}
|
||||
/* Call middle-level interface */
|
||||
info = LAPACKE_ctrsyl3_work( matrix_layout, trana, tranb, isgn, m, n, a,
|
||||
lda, b, ldb, c, ldc, scale, swork, ldswork );
|
||||
/* Release memory and exit */
|
||||
LAPACKE_free( swork );
|
||||
exit_level_0:
|
||||
if( info == LAPACK_WORK_MEMORY_ERROR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_ctrsyl3", info );
|
||||
}
|
||||
return info;
|
||||
}
|
|
@ -0,0 +1,88 @@
|
|||
#include "lapacke_utils.h"
|
||||
|
||||
lapack_int LAPACKE_ctrsyl3_work( int matrix_layout, char trana, char tranb,
|
||||
lapack_int isgn, lapack_int m, lapack_int n,
|
||||
const lapack_complex_float* a, lapack_int lda,
|
||||
const lapack_complex_float* b, lapack_int ldb,
|
||||
lapack_complex_float* c, lapack_int ldc,
|
||||
float* scale, float* swork,
|
||||
lapack_int ldswork )
|
||||
{
|
||||
lapack_int info = 0;
|
||||
if( matrix_layout == LAPACK_COL_MAJOR ) {
|
||||
/* Call LAPACK function and adjust info */
|
||||
LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a, &lda, b, &ldb, c, &ldc,
|
||||
scale, swork, &ldswork, &info );
|
||||
if( info < 0 ) {
|
||||
info = info - 1;
|
||||
}
|
||||
} else if( matrix_layout == LAPACK_ROW_MAJOR ) {
|
||||
lapack_int lda_t = MAX(1,m);
|
||||
lapack_int ldb_t = MAX(1,n);
|
||||
lapack_int ldc_t = MAX(1,m);
|
||||
lapack_complex_float* a_t = NULL;
|
||||
lapack_complex_float* b_t = NULL;
|
||||
lapack_complex_float* c_t = NULL;
|
||||
/* Check leading dimension(s) */
|
||||
if( lda < m ) {
|
||||
info = -8;
|
||||
LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldb < n ) {
|
||||
info = -10;
|
||||
LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldc < n ) {
|
||||
info = -12;
|
||||
LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info );
|
||||
return info;
|
||||
}
|
||||
/* Allocate memory for temporary array(s) */
|
||||
a_t = (lapack_complex_float*)
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,m) );
|
||||
if( a_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_0;
|
||||
}
|
||||
b_t = (lapack_complex_float*)
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) );
|
||||
if( b_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_1;
|
||||
}
|
||||
c_t = (lapack_complex_float*)
|
||||
LAPACKE_malloc( sizeof(lapack_complex_float) * ldc_t * MAX(1,n) );
|
||||
if( c_t == NULL ) {
|
||||
info = LAPACK_TRANSPOSE_MEMORY_ERROR;
|
||||
goto exit_level_2;
|
||||
}
|
||||
/* Transpose input matrices */
|
||||
LAPACKE_cge_trans( matrix_layout, m, m, a, lda, a_t, lda_t );
|
||||
LAPACKE_cge_trans( matrix_layout, n, n, b, ldb, b_t, ldb_t );
|
||||
LAPACKE_cge_trans( matrix_layout, m, n, c, ldc, c_t, ldc_t );
|
||||
/* Call LAPACK function and adjust info */
|
||||
LAPACK_ctrsyl3( &trana, &tranb, &isgn, &m, &n, a_t, &lda_t, b_t, &ldb_t,
|
||||
c_t, &ldc_t, scale, swork, &ldswork, &info );
|
||||
if( info < 0 ) {
|
||||
info = info - 1;
|
||||
}
|
||||
/* Transpose output matrices */
|
||||
LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, c_t, ldc_t, c, ldc );
|
||||
/* Release memory and exit */
|
||||
LAPACKE_free( c_t );
|
||||
exit_level_2:
|
||||
LAPACKE_free( b_t );
|
||||
exit_level_1:
|
||||
LAPACKE_free( a_t );
|
||||
exit_level_0:
|
||||
if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info );
|
||||
}
|
||||
} else {
|
||||
info = -1;
|
||||
LAPACKE_xerbla( "LAPACKE_ctrsyl3_work", info );
|
||||
}
|
||||
return info;
|
||||
}
|
|
@ -59,12 +59,12 @@ lapack_int LAPACKE_dgeev_work( int matrix_layout, char jobvl, char jobvr,
|
|||
LAPACKE_xerbla( "LAPACKE_dgeev_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvl < n ) {
|
||||
if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) {
|
||||
info = -10;
|
||||
LAPACKE_xerbla( "LAPACKE_dgeev_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvr < n ) {
|
||||
if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) {
|
||||
info = -12;
|
||||
LAPACKE_xerbla( "LAPACKE_dgeev_work", info );
|
||||
return info;
|
||||
|
|
|
@ -63,12 +63,12 @@ lapack_int LAPACKE_dgeevx_work( int matrix_layout, char balanc, char jobvl,
|
|||
LAPACKE_xerbla( "LAPACKE_dgeevx_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvl < n ) {
|
||||
if( ldvl < 1 || ( LAPACKE_lsame( jobvl, 'v' ) && ldvl < n ) ) {
|
||||
info = -12;
|
||||
LAPACKE_xerbla( "LAPACKE_dgeevx_work", info );
|
||||
return info;
|
||||
}
|
||||
if( ldvr < n ) {
|
||||
if( ldvr < 1 || ( LAPACKE_lsame( jobvr, 'v' ) && ldvr < n ) ) {
|
||||
info = -14;
|
||||
LAPACKE_xerbla( "LAPACKE_dgeevx_work", info );
|
||||
return info;
|
||||
|
|
|
@ -48,7 +48,6 @@ lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp,
|
|||
lapack_int lrwork = -1;
|
||||
double* rwork = NULL;
|
||||
double rwork_query;
|
||||
lapack_int i;
|
||||
if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
|
||||
LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 );
|
||||
return -1;
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue