Merge pull request #4210 from xianyi/develop
merge develop into 0.3.0 for 0.3.24
This commit is contained in:
commit
2c68822cde
|
@ -0,0 +1,167 @@
|
||||||
|
macos_instance:
|
||||||
|
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: AppleM1/LLVM
|
||||||
|
compile_script:
|
||||||
|
- brew install llvm
|
||||||
|
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
|
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
|
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
|
- make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: AppleM1/LLVM/ILP64
|
||||||
|
compile_script:
|
||||||
|
- brew install llvm
|
||||||
|
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
|
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
|
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
|
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: AppleM1/LLVM/CMAKE
|
||||||
|
compile_script:
|
||||||
|
- brew install llvm
|
||||||
|
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
|
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
|
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
|
- mkdir build
|
||||||
|
- cd build
|
||||||
|
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||||
|
- make
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: AppleM1/GCC/MAKE/OPENMP
|
||||||
|
compile_script:
|
||||||
|
- brew install gcc@11
|
||||||
|
- export PATH=/opt/homebrew/bin:$PATH
|
||||||
|
- export LDFLAGS="-L/opt/homebrew/lib"
|
||||||
|
- export CPPFLAGS="-I/opt/homebrew/include"
|
||||||
|
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||||
|
|
||||||
|
macos_instance:
|
||||||
|
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||||
|
task:
|
||||||
|
name: AppleM1/LLVM x86_64 xbuild
|
||||||
|
compile_script:
|
||||||
|
- #brew install llvm
|
||||||
|
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
|
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
|
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
|
- export ARCHS="i386 x86_64"
|
||||||
|
- export ARCHS_STANDARD="i386 x86_64"
|
||||||
|
- export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
|
||||||
|
- export ARCHS_STANDARD_64_BIT=x86_64
|
||||||
|
- export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
|
||||||
|
- export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
|
||||||
|
- export VALID_ARCHS="i386 x86_64"
|
||||||
|
- xcrun --sdk macosx --show-sdk-path
|
||||||
|
- xcodebuild -version
|
||||||
|
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
|
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
|
||||||
|
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||||
|
always:
|
||||||
|
config_artifacts:
|
||||||
|
path: "*conf*"
|
||||||
|
type: text/plain
|
||||||
|
# lib_artifacts:
|
||||||
|
# path: "libopenblas*"
|
||||||
|
# type: application/octet-stream
|
||||||
|
|
||||||
|
macos_instance:
|
||||||
|
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||||
|
task:
|
||||||
|
name: AppleM1/LLVM armv8-ios xbuild
|
||||||
|
compile_script:
|
||||||
|
- #brew install llvm
|
||||||
|
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
|
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
|
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
|
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
|
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||||
|
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
|
||||||
|
always:
|
||||||
|
config_artifacts:
|
||||||
|
path: "*conf*"
|
||||||
|
type: text/plain
|
||||||
|
|
||||||
|
macos_instance:
|
||||||
|
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||||
|
task:
|
||||||
|
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||||
|
compile_script:
|
||||||
|
- #brew install android-ndk
|
||||||
|
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
|
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
|
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
|
- find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
|
||||||
|
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
|
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||||
|
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||||
|
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||||
|
always:
|
||||||
|
config_artifacts:
|
||||||
|
path: "*conf*"
|
||||||
|
type: text/plain
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: NeoverseN1
|
||||||
|
arm_container:
|
||||||
|
image: node:latest
|
||||||
|
compile_script:
|
||||||
|
- make
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: NeoverseN1-ILP64
|
||||||
|
arm_container:
|
||||||
|
image: node:latest
|
||||||
|
compile_script:
|
||||||
|
- make INTERFACE64=1
|
||||||
|
|
||||||
|
task:
|
||||||
|
name: NeoverseN1-OMP
|
||||||
|
arm_container:
|
||||||
|
image: node:latest
|
||||||
|
cpu: 8
|
||||||
|
compile_script:
|
||||||
|
- make USE_OPENMP=1
|
||||||
|
|
||||||
|
FreeBSD_task:
|
||||||
|
name: FreeBSD-gcc12
|
||||||
|
freebsd_instance:
|
||||||
|
image_family: freebsd-13-2
|
||||||
|
install_script:
|
||||||
|
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
|
||||||
|
compile_script:
|
||||||
|
- ls -l /usr/local/lib
|
||||||
|
- gmake CC=gcc
|
||||||
|
|
||||||
|
|
||||||
|
FreeBSD_task:
|
||||||
|
name: freebsd-gcc12-ilp64
|
||||||
|
freebsd_instance:
|
||||||
|
image_family: freebsd-13-2
|
||||||
|
install_script:
|
||||||
|
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
|
||||||
|
compile_script:
|
||||||
|
- ls -l /usr/local/lib
|
||||||
|
- gmake CC=gcc INTERFACE64=1
|
||||||
|
|
||||||
|
#task:
|
||||||
|
# name: Windows/LLVM16 --- too slow ---
|
||||||
|
# windows_container:
|
||||||
|
# image: cirrusci/windowsservercore:cmake-2021.12.07
|
||||||
|
# install_script:
|
||||||
|
# - choco list --localonly
|
||||||
|
# - choco install -y llvm
|
||||||
|
# - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
|
||||||
|
# - choco install -y ninja
|
||||||
|
# - refreshenv
|
||||||
|
# - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
|
||||||
|
# - vcvarsall x64
|
||||||
|
# - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
|
||||||
|
# - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
|
||||||
|
# - cd build
|
||||||
|
# - cmake --build .
|
||||||
|
# - ctest
|
|
@ -0,0 +1,121 @@
|
||||||
|
name: c910v qemu test
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read # to fetch code (actions/checkout)
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
TEST:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
|
||||||
|
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- target: RISCV64_GENERIC
|
||||||
|
triple: riscv64-linux-gnu
|
||||||
|
apt_triple: riscv64-linux-gnu
|
||||||
|
opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
|
||||||
|
- target: C910V
|
||||||
|
triple: riscv64-unknown-linux-gnu
|
||||||
|
apt_triple: riscv64-linux-gnu
|
||||||
|
opts: NO_SHARED=1 TARGET=C910V
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: install build deps
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
|
||||||
|
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
|
||||||
|
|
||||||
|
- name: checkout qemu
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
with:
|
||||||
|
repository: T-head-Semi/qemu
|
||||||
|
path: qemu
|
||||||
|
ref: 1e692ebb43d396c52352406323fc782c1ac99a42
|
||||||
|
|
||||||
|
- name: build qemu
|
||||||
|
run: |
|
||||||
|
# Force the use of the c910v qemu-user
|
||||||
|
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
|
||||||
|
cd qemu
|
||||||
|
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
|
||||||
|
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
|
||||||
|
make -j$(nproc)
|
||||||
|
make install
|
||||||
|
|
||||||
|
- name: Compilation cache
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.ccache
|
||||||
|
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||||
|
restore-keys: |
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||||
|
|
||||||
|
- name: Configure ccache
|
||||||
|
run: |
|
||||||
|
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||||
|
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||||
|
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||||
|
ccache -s
|
||||||
|
|
||||||
|
- name: build OpenBLAS
|
||||||
|
run: |
|
||||||
|
wget ${xuetie_toolchain}/${toolchain_file_name}
|
||||||
|
tar -xvf ${toolchain_file_name} -C /opt
|
||||||
|
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
|
||||||
|
|
||||||
|
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||||
|
|
||||||
|
- name: test
|
||||||
|
run: |
|
||||||
|
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
|
||||||
|
qemu-riscv64 ./utest/openblas_utest
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1
|
||||||
|
rm -f ./test/?BLAT2.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
|
||||||
|
rm -f ./test/?BLAT2.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
|
||||||
|
rm -f ./test/?BLAT3.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
|
||||||
|
rm -f ./test/?BLAT3.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
|
|
@ -151,40 +151,53 @@ jobs:
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
msystem: [MINGW64, MINGW32, CLANG64]
|
msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
|
||||||
idx: [int32, int64]
|
idx: [int32, int64]
|
||||||
build-type: [Release]
|
build-type: [Release]
|
||||||
include:
|
include:
|
||||||
- msystem: MINGW64
|
- msystem: MINGW64
|
||||||
idx: int32
|
idx: int32
|
||||||
target-prefix: mingw-w64-x86_64
|
target-prefix: mingw-w64-x86_64
|
||||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
fc-pkg: fc
|
||||||
- msystem: MINGW32
|
- msystem: MINGW32
|
||||||
idx: int32
|
idx: int32
|
||||||
target-prefix: mingw-w64-i686
|
target-prefix: mingw-w64-i686
|
||||||
fc-pkg: mingw-w64-i686-gcc-fortran
|
fc-pkg: fc
|
||||||
- msystem: CLANG64
|
- msystem: CLANG64
|
||||||
idx: int32
|
idx: int32
|
||||||
target-prefix: mingw-w64-clang-x86_64
|
target-prefix: mingw-w64-clang-x86_64
|
||||||
|
fc-pkg: fc
|
||||||
|
# Compiling with Flang 16 seems to cause test errors on machines
|
||||||
|
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||||
|
no-avx512-flags: -DNO_AVX512=1
|
||||||
|
- msystem: CLANG32
|
||||||
|
idx: int32
|
||||||
|
target-prefix: mingw-w64-clang-i686
|
||||||
|
fc-pkg: cc
|
||||||
c-lapack-flags: -DC_LAPACK=ON
|
c-lapack-flags: -DC_LAPACK=ON
|
||||||
- msystem: MINGW64
|
- msystem: MINGW64
|
||||||
idx: int64
|
idx: int64
|
||||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||||
target-prefix: mingw-w64-x86_64
|
target-prefix: mingw-w64-x86_64
|
||||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
fc-pkg: fc
|
||||||
- msystem: CLANG64
|
- msystem: CLANG64
|
||||||
idx: int64
|
idx: int64
|
||||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||||
target-prefix: mingw-w64-clang-x86_64
|
target-prefix: mingw-w64-clang-x86_64
|
||||||
c-lapack-flags: -DC_LAPACK=ON
|
fc-pkg: fc
|
||||||
|
# Compiling with Flang 16 seems to cause test errors on machines
|
||||||
|
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||||
|
no-avx512-flags: -DNO_AVX512=1
|
||||||
- msystem: MINGW64
|
- msystem: MINGW64
|
||||||
idx: int32
|
idx: int32
|
||||||
target-prefix: mingw-w64-x86_64
|
target-prefix: mingw-w64-x86_64
|
||||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
fc-pkg: fc
|
||||||
build-type: None
|
build-type: None
|
||||||
exclude:
|
exclude:
|
||||||
- msystem: MINGW32
|
- msystem: MINGW32
|
||||||
idx: int64
|
idx: int64
|
||||||
|
- msystem: CLANG32
|
||||||
|
idx: int64
|
||||||
|
|
||||||
defaults:
|
defaults:
|
||||||
run:
|
run:
|
||||||
|
@ -209,7 +222,7 @@ jobs:
|
||||||
install: >-
|
install: >-
|
||||||
base-devel
|
base-devel
|
||||||
${{ matrix.target-prefix }}-cc
|
${{ matrix.target-prefix }}-cc
|
||||||
${{ matrix.fc-pkg }}
|
${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
|
||||||
${{ matrix.target-prefix }}-cmake
|
${{ matrix.target-prefix }}-cmake
|
||||||
${{ matrix.target-prefix }}-ninja
|
${{ matrix.target-prefix }}-ninja
|
||||||
${{ matrix.target-prefix }}-ccache
|
${{ matrix.target-prefix }}-ccache
|
||||||
|
@ -217,14 +230,21 @@ jobs:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
- name: Compilation cache
|
- name: Prepare ccache
|
||||||
uses: actions/cache@v3
|
# Get cache location of ccache
|
||||||
with:
|
# Create the key that is used in the actions/cache/restore and actions/cache/save steps
|
||||||
# It looks like this path needs to be hard-coded.
|
id: ccache-prepare
|
||||||
path: C:/msys64/home/runneradmin/.ccache
|
run: |
|
||||||
|
echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
|
||||||
# We include the commit sha in the cache key, as new cache entries are
|
# We include the commit sha in the cache key, as new cache entries are
|
||||||
# only created if there is no existing entry for the key yet.
|
# only created if there is no existing entry for the key yet.
|
||||||
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
|
echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
|
||||||
|
|
||||||
|
- name: Restore ccache
|
||||||
|
uses: actions/cache/restore@v3
|
||||||
|
with:
|
||||||
|
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||||
|
key: ${{ steps.ccache-prepare.outputs.key }}
|
||||||
# Restore a matching ccache cache entry. Prefer same branch.
|
# Restore a matching ccache cache entry. Prefer same branch.
|
||||||
restore-keys: |
|
restore-keys: |
|
||||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
|
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
|
||||||
|
@ -234,9 +254,10 @@ jobs:
|
||||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
|
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
|
||||||
run: |
|
run: |
|
||||||
which ccache
|
which ccache
|
||||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||||
echo "max_size = 250M" > ~/.ccache/ccache.conf
|
echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
|
||||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
|
||||||
|
ccache -p
|
||||||
ccache -s
|
ccache -s
|
||||||
echo $HOME
|
echo $HOME
|
||||||
cygpath -w $HOME
|
cygpath -w $HOME
|
||||||
|
@ -253,6 +274,7 @@ jobs:
|
||||||
-DTARGET=CORE2 \
|
-DTARGET=CORE2 \
|
||||||
${{ matrix.idx64-flags }} \
|
${{ matrix.idx64-flags }} \
|
||||||
${{ matrix.c-lapack-flags }} \
|
${{ matrix.c-lapack-flags }} \
|
||||||
|
${{ matrix.no-avx512-flags }} \
|
||||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||||
..
|
..
|
||||||
|
@ -264,10 +286,30 @@ jobs:
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
run: ccache -s
|
run: ccache -s
|
||||||
|
|
||||||
|
- name: Save ccache
|
||||||
|
# Save the cache after we are done (successfully) building
|
||||||
|
uses: actions/cache/save@v3
|
||||||
|
with:
|
||||||
|
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||||
|
key: ${{ steps.ccache-prepare.outputs.key }}
|
||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
|
id: run-ctest
|
||||||
timeout-minutes: 60
|
timeout-minutes: 60
|
||||||
run: cd build && ctest
|
run: cd build && ctest
|
||||||
|
|
||||||
|
- name: Re-run tests
|
||||||
|
if: always() && (steps.run-ctest.outcome == 'failure')
|
||||||
|
timeout-minutes: 60
|
||||||
|
run: |
|
||||||
|
cd build
|
||||||
|
echo "::group::Re-run ctest"
|
||||||
|
ctest --rerun-failed --output-on-failure || true
|
||||||
|
echo "::endgroup::"
|
||||||
|
echo "::group::Log from these tests"
|
||||||
|
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
|
||||||
|
echo "::endgroup::"
|
||||||
|
|
||||||
|
|
||||||
cross_build:
|
cross_build:
|
||||||
runs-on: ubuntu-22.04
|
runs-on: ubuntu-22.04
|
||||||
|
@ -295,6 +337,7 @@ jobs:
|
||||||
|
|
||||||
- name: Install Dependencies
|
- name: Install Dependencies
|
||||||
run: |
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
|
sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
|
||||||
|
|
||||||
- name: Compilation cache
|
- name: Compilation cache
|
||||||
|
|
|
@ -0,0 +1,110 @@
|
||||||
|
name: loongarch64 qemu test
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
TEST:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- target: LOONGSONGENERIC
|
||||||
|
triple: loongarch64-unknown-linux-gnu
|
||||||
|
opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
|
||||||
|
- target: LOONGSON3R5
|
||||||
|
triple: loongarch64-unknown-linux-gnu
|
||||||
|
opts: NO_SHARED=1 TARGET=LOONGSON3R5
|
||||||
|
- target: LOONGSON2K1000
|
||||||
|
triple: loongarch64-unknown-linux-gnu
|
||||||
|
opts: NO_SHARED=1 TARGET=LOONGSON2K1000
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Install APT deps
|
||||||
|
run: |
|
||||||
|
sudo add-apt-repository ppa:savoury1/virtualisation
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
|
||||||
|
qemu-user-static
|
||||||
|
|
||||||
|
- name: Download and install loongarch64-toolchain
|
||||||
|
run: |
|
||||||
|
wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
|
||||||
|
tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
|
||||||
|
|
||||||
|
- name: Set env
|
||||||
|
run: |
|
||||||
|
echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
|
||||||
|
echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
|
||||||
|
|
||||||
|
- name: Compilation cache
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.ccache
|
||||||
|
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||||
|
restore-keys: |
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||||
|
|
||||||
|
- name: Configure ccache
|
||||||
|
run: |
|
||||||
|
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||||
|
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||||
|
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||||
|
ccache -s
|
||||||
|
|
||||||
|
- name: Disable utest dsdot:dsdot_n_1
|
||||||
|
run: |
|
||||||
|
echo -n > utest/test_dsdot.c
|
||||||
|
echo "Due to the qemu versions 7.2 causing utest cases to fail,"
|
||||||
|
echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
|
||||||
|
|
||||||
|
- name: Build OpenBLAS
|
||||||
|
run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||||
|
|
||||||
|
- name: Test
|
||||||
|
run: |
|
||||||
|
qemu-loongarch64-static ./utest/openblas_utest
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
|
||||||
|
rm -f ./test/?BLAT2.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||||
|
rm -f ./test/?BLAT2.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||||
|
rm -f ./test/?BLAT3.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
||||||
|
rm -f ./test/?BLAT3.SUMM
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||||
|
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
|
@ -14,6 +14,7 @@ lapack-3.4.2
|
||||||
lapack-3.4.2.tgz
|
lapack-3.4.2.tgz
|
||||||
lapack-netlib/make.inc
|
lapack-netlib/make.inc
|
||||||
lapack-netlib/lapacke/include/lapacke_mangling.h
|
lapack-netlib/lapacke/include/lapacke_mangling.h
|
||||||
|
lapack-netlib/SRC/la_constants.mod
|
||||||
lapack-netlib/TESTING/testing_results.txt
|
lapack-netlib/TESTING/testing_results.txt
|
||||||
lapack-netlib/INSTALL/test*
|
lapack-netlib/INSTALL/test*
|
||||||
lapack-netlib/TESTING/xeigtstc
|
lapack-netlib/TESTING/xeigtstc
|
||||||
|
@ -71,6 +72,7 @@ test/SBLAT3.SUMM
|
||||||
test/ZBLAT2.SUMM
|
test/ZBLAT2.SUMM
|
||||||
test/ZBLAT3.SUMM
|
test/ZBLAT3.SUMM
|
||||||
test/SHBLAT3.SUMM
|
test/SHBLAT3.SUMM
|
||||||
|
test/SBBLAT3.SUMM
|
||||||
test/cblat1
|
test/cblat1
|
||||||
test/cblat2
|
test/cblat2
|
||||||
test/cblat3
|
test/cblat3
|
||||||
|
@ -81,6 +83,7 @@ test/sblat1
|
||||||
test/sblat2
|
test/sblat2
|
||||||
test/sblat3
|
test/sblat3
|
||||||
test/test_shgemm
|
test/test_shgemm
|
||||||
|
test/test_sbgemm
|
||||||
test/zblat1
|
test/zblat1
|
||||||
test/zblat2
|
test/zblat2
|
||||||
test/zblat3
|
test/zblat3
|
||||||
|
|
|
@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
|
||||||
|
|
||||||
set(OpenBLAS_MAJOR_VERSION 0)
|
set(OpenBLAS_MAJOR_VERSION 0)
|
||||||
set(OpenBLAS_MINOR_VERSION 3)
|
set(OpenBLAS_MINOR_VERSION 3)
|
||||||
set(OpenBLAS_PATCH_VERSION 23)
|
set(OpenBLAS_PATCH_VERSION 23.dev)
|
||||||
|
|
||||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||||
|
|
||||||
|
@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers)
|
||||||
#######
|
#######
|
||||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
|
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
|
||||||
|
|
||||||
|
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
|
||||||
|
|
||||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||||
|
|
||||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||||
|
@ -309,20 +311,26 @@ endif()
|
||||||
|
|
||||||
#if (MSVC OR NOT NOFORTRAN)
|
#if (MSVC OR NOT NOFORTRAN)
|
||||||
if (NOT NO_CBLAS)
|
if (NOT NO_CBLAS)
|
||||||
|
if (NOT ONLY_CBLAS)
|
||||||
# Broken without fortran on unix
|
# Broken without fortran on unix
|
||||||
add_subdirectory(utest)
|
add_subdirectory(utest)
|
||||||
endif()
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT NOFORTRAN)
|
if (NOT NOFORTRAN)
|
||||||
|
if (NOT ONLY_CBLAS)
|
||||||
# Build test and ctest
|
# Build test and ctest
|
||||||
add_subdirectory(test)
|
add_subdirectory(test)
|
||||||
|
endif()
|
||||||
if (BUILD_TESTING)
|
if (BUILD_TESTING)
|
||||||
add_subdirectory(lapack-netlib/TESTING)
|
add_subdirectory(lapack-netlib/TESTING)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
if(NOT NO_CBLAS)
|
if(NOT NO_CBLAS)
|
||||||
|
if (NOT ONLY_CBLAS)
|
||||||
add_subdirectory(ctest)
|
add_subdirectory(ctest)
|
||||||
endif()
|
endif()
|
||||||
|
endif()
|
||||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||||
add_subdirectory(cpp_thread_test)
|
add_subdirectory(cpp_thread_test)
|
||||||
endif()
|
endif()
|
||||||
|
@ -398,15 +406,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
||||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (${BUILD_LAPACK_DEPRECATED})
|
||||||
|
set (BLD 1)
|
||||||
|
else ()
|
||||||
|
set (BLD 0)
|
||||||
|
endif()
|
||||||
|
if (${BUILD_BFLOAT16})
|
||||||
|
set (BBF16 1)
|
||||||
|
else ()
|
||||||
|
set (BBF16 0)
|
||||||
|
endif()
|
||||||
|
if (${BUILD_SINGLE})
|
||||||
|
set (BS 1)
|
||||||
|
else ()
|
||||||
|
set (BS 0)
|
||||||
|
endif()
|
||||||
|
if (${BUILD_DOUBLE})
|
||||||
|
set (BD 1)
|
||||||
|
else ()
|
||||||
|
set (BD 0)
|
||||||
|
endif()
|
||||||
|
if (${BUILD_COMPLEX})
|
||||||
|
set (BC 1)
|
||||||
|
else ()
|
||||||
|
set (BC 0)
|
||||||
|
endif()
|
||||||
|
if (${BUILD_COMPLEX16})
|
||||||
|
set (BZ 1)
|
||||||
|
else ()
|
||||||
|
set (BZ 0)
|
||||||
|
endif()
|
||||||
if (NOT USE_PERL)
|
if (NOT USE_PERL)
|
||||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||||
COMMENT "renaming symbols"
|
COMMENT "renaming symbols"
|
||||||
)
|
)
|
||||||
else()
|
else()
|
||||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||||
COMMENT "renaming symbols"
|
COMMENT "renaming symbols"
|
||||||
)
|
)
|
||||||
|
@ -511,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/
|
||||||
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||||
|
|
||||||
|
|
||||||
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
|
||||||
set(PN OpenBLAS)
|
set(PN OpenBLAS)
|
||||||
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
|
set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
|
||||||
configure_package_config_file(cmake/${PN}Config.cmake.in
|
configure_package_config_file(cmake/${PN}Config.cmake.in
|
||||||
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
|
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
|
||||||
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
|
|
@ -23,6 +23,9 @@
|
||||||
* Optimization on AMD Piledriver
|
* Optimization on AMD Piledriver
|
||||||
* Optimization on Intel Haswell
|
* Optimization on Intel Haswell
|
||||||
|
|
||||||
|
* Chris Sidebottom <chris.sidebottom@arm.com>
|
||||||
|
* Optimizations and other improvements targeting AArch64
|
||||||
|
|
||||||
## Previous Developers
|
## Previous Developers
|
||||||
|
|
||||||
* Zaheer Chothia <zaheer.chothia@gmail.com>
|
* Zaheer Chothia <zaheer.chothia@gmail.com>
|
||||||
|
|
100
Changelog.txt
100
Changelog.txt
|
@ -1,4 +1,104 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.24
|
||||||
|
03-Sep-2023
|
||||||
|
|
||||||
|
general:
|
||||||
|
- declared the arguments of cblas_xerbla as const (in accordance with the reference implementation
|
||||||
|
and others, the previous discrepancy appears to have dated back to GotoBLAS)
|
||||||
|
- fixed the implementation of ?GEMMT that was added in 0.3.23
|
||||||
|
- made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds
|
||||||
|
- fixed application of SYMBOLSUFFIX in CMAKE builds
|
||||||
|
- fixed missing SSYCONVF function in the shared library
|
||||||
|
- fixed parallel build logic used with gmake
|
||||||
|
- added support for compilation with LLVM17, in particular its new Fortran compiler
|
||||||
|
- added support for CMAKE builds using the NVIDIA HPC compiler
|
||||||
|
- fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler
|
||||||
|
- fixed cross-build detection and management in c_check
|
||||||
|
- disabled building of the tests with CMAKE when ONLY_CBLAS is defined
|
||||||
|
- fixed several issues with the handling of runtime limits on the number of OPENMP threads
|
||||||
|
- corrected the error code returned by SGEADD/DGEADD when LDA is too small
|
||||||
|
- corrected the error code returned by IMATCOPY when LDB is too small
|
||||||
|
- updated ?NRM2 to support negative increment values (as introduced in release 3.10
|
||||||
|
of the reference BLAS)
|
||||||
|
- fixed OpenMP builds with CLANG for the case where libomp is not in a standard location
|
||||||
|
- fixed a potential overwrite of unrelated memory during thread initialisation on startup
|
||||||
|
- fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK
|
||||||
|
- fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22
|
||||||
|
- fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE
|
||||||
|
- applied additions and corrections from the development branch of Reference-LAPACK:
|
||||||
|
- fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885)
|
||||||
|
- fixed workspace query results in LAPACK ?SYTRF/?TREVC3 (from Reference-LAPACK PR 883)
|
||||||
|
- fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878)
|
||||||
|
- fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876)
|
||||||
|
- added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839)
|
||||||
|
- corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867)
|
||||||
|
- removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860)
|
||||||
|
- updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852)
|
||||||
|
- fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855)
|
||||||
|
- fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849)
|
||||||
|
- added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736)
|
||||||
|
- fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854)
|
||||||
|
- applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847)
|
||||||
|
- removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832)
|
||||||
|
- fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836)
|
||||||
|
- added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837)
|
||||||
|
- updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831)
|
||||||
|
- improved algorithm description in ?GELSY (Reference-LAPACK PR 833)
|
||||||
|
- fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830)
|
||||||
|
- fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768)
|
||||||
|
- added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827)
|
||||||
|
- added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795)
|
||||||
|
- fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820)
|
||||||
|
- adopted the refactored ?GEBAL implementation (Reference-LAPACK PR 808)
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
- added cpu model autodetection for Intel Alder Lake N
|
||||||
|
- added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel
|
||||||
|
- worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer
|
||||||
|
- fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG
|
||||||
|
- fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH
|
||||||
|
- fixed feature-based cputype fallback in DYNAMIC_ARCH
|
||||||
|
- added support for building the AVX512 kernels with the NVIDIA HPC compiler
|
||||||
|
- corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case
|
||||||
|
- fixed a potential use of uninitialized variables in ZTRSM
|
||||||
|
|
||||||
|
ARM64:
|
||||||
|
- added cpu model autodetection for Apple M2
|
||||||
|
- fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register)
|
||||||
|
- added support for building the SVE kernels with the NVIDIA HPC compiler
|
||||||
|
- added support for building the SVE kernels with the Apple Clang compiler
|
||||||
|
- fixed compiler option handling for building the SVE kernels with LLVM
|
||||||
|
- implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse
|
||||||
|
- activated SVE SGEMM and DGEMM kernels for Neoverse V1
|
||||||
|
- improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1
|
||||||
|
- improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH
|
||||||
|
- fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or
|
||||||
|
container restrictions into account
|
||||||
|
- fixed a potential use of uninitialized variables in ZTRSM
|
||||||
|
- fixed a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds
|
||||||
|
|
||||||
|
LOONGARCH64:
|
||||||
|
- added ABI detection
|
||||||
|
- added support for cpu affinity handling
|
||||||
|
- fixed compilation with early versions of the Loongson toolchain
|
||||||
|
- added an optimized SGEMM kernel for 3A5000
|
||||||
|
- added optimized DGEMV kernels for 3A5000
|
||||||
|
- improved the performance of the DGEMM kernel for 3A5000
|
||||||
|
|
||||||
|
MIPS64:
|
||||||
|
- fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
- fixed compiler warnings in the POWER10 SBGEMM kernel
|
||||||
|
|
||||||
|
RISCV:
|
||||||
|
- fixed application of the INTERFACE64 option when building with CMAKE
|
||||||
|
- fixed a potential misdetection of RISCV hardware as 32bit in CMAKE builds
|
||||||
|
- fixed IDAMAX and DOT kernels for C910V
|
||||||
|
- fixed corner cases in the ROT and SWAP kernels for C910V
|
||||||
|
- fixed compilation of the C910V target with recent vendor compilers
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.3.23
|
Version 0.3.23
|
||||||
01-Apr-2023
|
01-Apr-2023
|
||||||
|
|
|
@ -1,9 +1,14 @@
|
||||||
node {
|
pipeline {
|
||||||
stage('Checkout') {
|
agent {
|
||||||
checkout
|
docker {
|
||||||
|
image 'osuosl/ubuntu-s390x'
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
stages {
|
||||||
stage('Build') {
|
stage('Build') {
|
||||||
sh("make")
|
steps {
|
||||||
|
sh 'make clean && make'
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
pipeline {
|
||||||
|
agent {
|
||||||
|
docker {
|
||||||
|
image 'osuosl/ubuntu-ppc64le'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stages {
|
||||||
|
stage('Build') {
|
||||||
|
steps {
|
||||||
|
sh 'sudo apt update'
|
||||||
|
sh 'sudo apt install gfortran -y'
|
||||||
|
sh 'make clean && make'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
10
Makefile
10
Makefile
|
@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
|
||||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||||
|
|
||||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
.NOTPARALLEL : shared
|
||||||
|
|
||||||
all :: libs netlib $(RELA) tests shared
|
all :: tests
|
||||||
@echo
|
@echo
|
||||||
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
||||||
@echo
|
@echo
|
||||||
|
@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
tests : libs netlib $(RELA) shared
|
tests : shared
|
||||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
touch $(LIBNAME)
|
touch $(LIBNAME)
|
||||||
ifndef NO_FBLAS
|
ifndef NO_FBLAS
|
||||||
|
@ -373,10 +373,10 @@ ifneq ($(CROSS), 1)
|
||||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
lapack-runtest:
|
lapack-runtest: lapack-test
|
||||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
|
||||||
|
|
||||||
|
|
||||||
blas-test:
|
blas-test:
|
||||||
|
|
|
@ -69,7 +69,7 @@ endif
|
||||||
# in GCC>=9
|
# in GCC>=9
|
||||||
ifeq ($(CORE), NEOVERSEN1)
|
ifeq ($(CORE), NEOVERSEN1)
|
||||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||||
|
@ -92,9 +92,14 @@ endif
|
||||||
# in GCC>=10.4
|
# in GCC>=10.4
|
||||||
ifeq ($(CORE), NEOVERSEV1)
|
ifeq ($(CORE), NEOVERSEV1)
|
||||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
|
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||||
|
ifeq (1, $(ISCLANG))
|
||||||
|
CCOMMON_OPT += -mtune=cortex-x1
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -mtune=neoverse-v1
|
||||||
|
endif
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||||
endif
|
endif
|
||||||
|
@ -122,8 +127,8 @@ endif
|
||||||
# in GCC>=10.4
|
# in GCC>=10.4
|
||||||
ifeq ($(CORE), NEOVERSEN2)
|
ifeq ($(CORE), NEOVERSEN2)
|
||||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||||
ifneq ($(OSNAME), Darwin)
|
ifneq ($(OSNAME), Darwin)
|
||||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||||
else
|
else
|
||||||
|
@ -155,7 +160,7 @@ endif
|
||||||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||||
ifeq ($(CORE), CORTEXA55)
|
ifeq ($(CORE), CORTEXA55)
|
||||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||||
ifeq ($(GCCVERSIONGTEQ8), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||||
|
@ -196,8 +201,13 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), THUNDERX3T110)
|
ifeq ($(CORE), THUNDERX3T110)
|
||||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
CCOMMON_OPT += -march=armv8.3-a
|
||||||
|
ifeq (0, $(ISCLANG))
|
||||||
|
CCOMMON_OPT += -mtune=thunderx3t110
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -mtune=thunderx2t99
|
||||||
|
endif
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||||
endif
|
endif
|
||||||
|
@ -225,9 +235,12 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||||
ifeq ($(CORE), EMAG8180)
|
ifeq ($(CORE), EMAG8180)
|
||||||
CCOMMON_OPT += -march=armv8-a -mtune=emag
|
CCOMMON_OPT += -march=armv8-a
|
||||||
|
ifeq ($(ISCLANG), 0)
|
||||||
|
CCOMMON_OPT += -mtune=emag
|
||||||
|
endif
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8-a -mtune=emag
|
FCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.3.23
|
VERSION = 0.3.23.dev
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
|
|
@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
|
||||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||||
|
CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# OS dependent settings
|
# OS dependent settings
|
||||||
#
|
#
|
||||||
|
@ -645,7 +650,7 @@ DYNAMIC_CORE += HASWELL ZEN
|
||||||
endif
|
endif
|
||||||
ifneq ($(NO_AVX512), 1)
|
ifneq ($(NO_AVX512), 1)
|
||||||
ifneq ($(NO_AVX2), 1)
|
ifneq ($(NO_AVX2), 1)
|
||||||
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
|
DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1
|
||||||
ifneq ($(NO_SVE), 1)
|
ifneq ($(NO_SVE), 1)
|
||||||
DYNAMIC_CORE += NEOVERSEV1
|
DYNAMIC_CORE += NEOVERSEV1
|
||||||
DYNAMIC_CORE += NEOVERSEN2
|
DYNAMIC_CORE += NEOVERSEN2
|
||||||
|
DYNAMIC_CORE += ARMV8SVE
|
||||||
endif
|
endif
|
||||||
DYNAMIC_CORE += CORTEXA55
|
DYNAMIC_CORE += CORTEXA55
|
||||||
DYNAMIC_CORE += FALKOR
|
DYNAMIC_CORE += FALKOR
|
||||||
|
@ -932,8 +938,12 @@ BINARY_DEFINED = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), loongarch64)
|
ifeq ($(ARCH), loongarch64)
|
||||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
|
||||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
ifneq ($(LA64_ABI), lp64d)
|
||||||
|
LA64_ABI=lp64
|
||||||
|
endif
|
||||||
|
CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
||||||
|
FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
@ -1082,8 +1092,9 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
|
||||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||||
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
FCOMMON_OPT += -Wall
|
FCOMMON_OPT += -Wall
|
||||||
# make single-threaded LAPACK calls thread-safe #1847
|
# make single-threaded LAPACK calls thread-safe #1847
|
||||||
FCOMMON_OPT += -frecursive
|
FCOMMON_OPT += -frecursive
|
||||||
|
@ -1097,6 +1108,7 @@ EXTRALIB += -lgfortran
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
ifdef NO_BINARY_MODE
|
ifdef NO_BINARY_MODE
|
||||||
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
|
@ -1763,6 +1775,8 @@ export TARGET_CORE
|
||||||
export NO_AVX512
|
export NO_AVX512
|
||||||
export NO_AVX2
|
export NO_AVX2
|
||||||
export BUILD_BFLOAT16
|
export BUILD_BFLOAT16
|
||||||
|
export NO_LSX
|
||||||
|
export NO_LASX
|
||||||
|
|
||||||
export SBGEMM_UNROLL_M
|
export SBGEMM_UNROLL_M
|
||||||
export SBGEMM_UNROLL_N
|
export SBGEMM_UNROLL_N
|
||||||
|
|
|
@ -75,18 +75,31 @@ endif
|
||||||
ifeq ($(CORE), COOPERLAKE)
|
ifeq ($(CORE), COOPERLAKE)
|
||||||
ifndef NO_AVX512
|
ifndef NO_AVX512
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
# cooperlake support was added in 10.1
|
# cooperlake support was added in 10.1
|
||||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||||
CCOMMON_OPT += -march=cooperlake
|
CCOMMON_OPT += -march=cooperlake
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=cooperlake
|
FCOMMON_OPT += -march=cooperlake
|
||||||
endif
|
endif
|
||||||
else # gcc not support, fallback to avx512
|
else # gcc not support, fallback to avx512
|
||||||
CCOMMON_OPT += -march=skylake-avx512
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=skylake-avx512
|
FCOMMON_OPT += -march=skylake-avx512
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
else ifeq ($(C_COMPILER), CLANG)
|
||||||
|
# cooperlake support was added in clang 9
|
||||||
|
ifeq ($(CLANGVERSIONGTEQ9), 1)
|
||||||
|
CCOMMON_OPT += -march=cooperlake
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=cooperlake
|
||||||
|
endif
|
||||||
|
else # not supported in clang, fallback to avx512
|
||||||
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=skylake-avx512
|
||||||
|
endif
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||||
|
@ -104,18 +117,31 @@ endif
|
||||||
ifeq ($(CORE), SAPPHIRERAPIDS)
|
ifeq ($(CORE), SAPPHIRERAPIDS)
|
||||||
ifndef NO_AVX512
|
ifndef NO_AVX512
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
# sapphire rapids support was added in 11
|
# sapphire rapids support was added in 11
|
||||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||||
CCOMMON_OPT += -march=sapphirerapids
|
CCOMMON_OPT += -march=sapphirerapids
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=sapphirerapids
|
FCOMMON_OPT += -march=sapphirerapids
|
||||||
endif
|
endif
|
||||||
else # gcc not support, fallback to avx512
|
else # gcc not support, fallback to avx512
|
||||||
CCOMMON_OPT += -march=skylake-avx512
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=skylake-avx512
|
FCOMMON_OPT += -march=skylake-avx512
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
else ifeq ($(C_COMPILER), CLANG)
|
||||||
|
# sapphire rapids support was added in clang 12
|
||||||
|
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
||||||
|
CCOMMON_OPT += -march=cooperlake
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=cooperlake
|
||||||
|
endif
|
||||||
|
else # not supported in clang, fallback to avx512
|
||||||
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=skylake-avx512
|
||||||
|
endif
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||||
|
|
|
@ -6,11 +6,15 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||||
|
|
||||||
Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/)
|
Cirrus CI: [](https://cirrus-ci.com/github/xianyi/OpenBLAS)
|
||||||
|
<!-- Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/)-->
|
||||||
|
|
||||||
|
|
||||||
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
||||||
|
|
||||||
|
OSUOSL POWERCI [](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
|
||||||
|
|
||||||
|
OSUOSL IBMZ-CI [](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
|
||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
|
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
|
||||||
|
|
|
@ -115,7 +115,7 @@ jobs:
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
ctest
|
ctest
|
||||||
|
|
||||||
|
@ -271,6 +271,19 @@ jobs:
|
||||||
- script: |
|
- script: |
|
||||||
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||||
|
|
||||||
|
- job: OSX_xbuild_DYNAMIC_ARM64
|
||||||
|
pool:
|
||||||
|
vmImage: 'macOS-11'
|
||||||
|
variables:
|
||||||
|
CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
|
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
|
||||||
|
steps:
|
||||||
|
- script: |
|
||||||
|
ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
|
||||||
|
/Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
|
||||||
|
/Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
|
||||||
|
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||||
|
|
||||||
- job: ALPINE_MUSL
|
- job: ALPINE_MUSL
|
||||||
pool:
|
pool:
|
||||||
vmImage: 'ubuntu-latest'
|
vmImage: 'ubuntu-latest'
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/***************************************************************************
|
/***************************************************************************
|
||||||
Copyright (c) 2014, The OpenBLAS Project
|
Copyright (c) 2014, 2023 The OpenBLAS Project
|
||||||
All rights reserved.
|
All rights reserved.
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are
|
modification, are permitted provided that the following conditions are
|
||||||
|
@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
|
||||||
int step = 1;
|
int step = 1;
|
||||||
int loops = 1;
|
int loops = 1;
|
||||||
|
|
||||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||||
|
|
||||||
double time1,timeg;
|
double time1,timeg;
|
||||||
|
|
||||||
|
@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
|
||||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
|
||||||
|
|
||||||
|
|
||||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||||
|
|
60
c_check
60
c_check
|
@ -31,13 +31,17 @@ flags="$*"
|
||||||
|
|
||||||
cross_suffix=""
|
cross_suffix=""
|
||||||
|
|
||||||
if [ "`dirname \"$compiler_name\"`" != '.' ]; then
|
if [ "`dirname "$compiler_name"`" != '.' ]; then
|
||||||
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
|
cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
bn=`basename $compiler_name`
|
cn=`echo $compiler_name | sed -e 's/ -.*//'`
|
||||||
|
bn=`basename "$cn"`
|
||||||
|
|
||||||
case "$bn" in
|
case "$bn" in
|
||||||
*-*) cross_suffix="$cross_suffix${bn%-*}-"
|
*-*) if [ "$bn" != '-' ]; then
|
||||||
|
cross_suffix="$cross_suffix${bn%-*}-"
|
||||||
|
fi
|
||||||
esac
|
esac
|
||||||
|
|
||||||
compiler=""
|
compiler=""
|
||||||
|
@ -164,7 +168,7 @@ fi
|
||||||
|
|
||||||
no_msa=0
|
no_msa=0
|
||||||
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
||||||
tmpd="$(mktemp -d)"
|
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||||
tmpf="$tmpd/a.c"
|
tmpf="$tmpd/a.c"
|
||||||
code='"addvi.b $w0, $w1, 1"'
|
code='"addvi.b $w0, $w1, 1"'
|
||||||
msa_flags='-mmsa -mfp64 -mload-store-pairs'
|
msa_flags='-mmsa -mfp64 -mload-store-pairs'
|
||||||
|
@ -181,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
||||||
rm -rf "$tmpd"
|
rm -rf "$tmpd"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
no_lsx=0
|
||||||
|
no_lasx=0
|
||||||
|
if [ "$architecture" = "loongarch64" ]; then
|
||||||
|
tmpd="$(mktemp -d)"
|
||||||
|
tmplsx="$tmpd/lsx.c"
|
||||||
|
codelsx='"vadd.b $vr0, $vr0, $vr0"'
|
||||||
|
lsx_flags='-march=loongarch64 -mlsx'
|
||||||
|
printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
|
||||||
|
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
|
||||||
|
args="$lsx_flags -o $tmplsx.o $tmplsx"
|
||||||
|
{
|
||||||
|
$compiler_name $flags $args >/dev/null 2>&1
|
||||||
|
} || {
|
||||||
|
no_lsx=1
|
||||||
|
}
|
||||||
|
|
||||||
|
tmplasx="$tmpd/lasx.c"
|
||||||
|
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
|
||||||
|
lasx_flags='-march=loongarch64 -mlasx'
|
||||||
|
printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
|
||||||
|
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
|
||||||
|
args="$lasx_flags -o $tmplasx.o $tmplasx"
|
||||||
|
{
|
||||||
|
$compiler_name $flags $args >/dev/null 2>&1
|
||||||
|
} || {
|
||||||
|
no_lasx=1
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -rf "$tmpd"
|
||||||
|
fi
|
||||||
|
|
||||||
case "$data" in
|
case "$data" in
|
||||||
*ARCH_X86_64*) architecture=x86_64 ;;
|
*ARCH_X86_64*) architecture=x86_64 ;;
|
||||||
*ARCH_X86*) architecture=x86 ;;
|
*ARCH_X86*) architecture=x86 ;;
|
||||||
|
@ -204,7 +239,7 @@ esac
|
||||||
|
|
||||||
no_avx512=0
|
no_avx512=0
|
||||||
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
||||||
tmpd=`mktemp -d`
|
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||||
tmpf="$tmpd/a.c"
|
tmpf="$tmpd/a.c"
|
||||||
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
|
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
|
||||||
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||||
|
@ -225,7 +260,7 @@ fi
|
||||||
|
|
||||||
no_rv64gv=0
|
no_rv64gv=0
|
||||||
if [ "$architecture" = "riscv64" ]; then
|
if [ "$architecture" = "riscv64" ]; then
|
||||||
tmpd=`mktemp -d`
|
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||||
tmpf="$tmpd/a.c"
|
tmpf="$tmpd/a.c"
|
||||||
code='"vsetvli zero, zero, e8, m1\n"'
|
code='"vsetvli zero, zero, e8, m1\n"'
|
||||||
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||||
|
@ -241,13 +276,16 @@ fi
|
||||||
|
|
||||||
no_sve=0
|
no_sve=0
|
||||||
if [ "$architecture" = "arm64" ]; then
|
if [ "$architecture" = "arm64" ]; then
|
||||||
tmpd=`mktemp -d`
|
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||||
tmpf="$tmpd/a.c"
|
tmpf="$tmpd/a.c"
|
||||||
printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
|
printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
|
||||||
args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
|
args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
|
||||||
no_sve=0
|
no_sve=0
|
||||||
{
|
{
|
||||||
$compiler_name $flags $args >/dev/null 2>&1
|
$compiler_name $flags $args >/dev/null 2>&1
|
||||||
|
} || {
|
||||||
|
args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
|
||||||
|
$compiler_name $flags $args >/dev/null 2>&1
|
||||||
} || {
|
} || {
|
||||||
no_sve=1
|
no_sve=1
|
||||||
}
|
}
|
||||||
|
@ -257,7 +295,7 @@ fi
|
||||||
c11_atomics=0
|
c11_atomics=0
|
||||||
case "$data" in
|
case "$data" in
|
||||||
*HAVE_C11*)
|
*HAVE_C11*)
|
||||||
tmpd=`mktemp -d`
|
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||||
tmpf="$tmpd/a.c"
|
tmpf="$tmpd/a.c"
|
||||||
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
|
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
|
||||||
args=" -c -o $tmpf.o $tmpf"
|
args=" -c -o $tmpf.o $tmpf"
|
||||||
|
@ -395,6 +433,8 @@ done
|
||||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||||
|
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
||||||
|
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
|
||||||
} >> "$makefile"
|
} >> "$makefile"
|
||||||
|
|
||||||
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
|
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
|
||||||
|
@ -410,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
|
||||||
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
|
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
|
||||||
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
|
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
|
||||||
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
|
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
|
||||||
|
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
|
||||||
|
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
|
||||||
} >> "$config"
|
} >> "$config"
|
||||||
|
|
||||||
|
|
||||||
|
|
45
c_check.pl
45
c_check.pl
|
@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$no_lsx = 0;
|
||||||
|
$no_lasx = 0;
|
||||||
|
if (($architecture eq "loongarch64")) {
|
||||||
|
eval "use File::Temp qw(tempfile)";
|
||||||
|
if ($@){
|
||||||
|
warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility";
|
||||||
|
} else {
|
||||||
|
$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||||
|
$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
|
||||||
|
$lsx_flags = "-march=loongarch64 -mlsx";
|
||||||
|
print $tmplsx "#include <lsxintrin.h>\n\n";
|
||||||
|
print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
|
||||||
|
|
||||||
|
$args = "$lsx_flags -o $tmplsx.o $tmplsx";
|
||||||
|
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||||
|
system(@cmd) == 0;
|
||||||
|
if ($? != 0) {
|
||||||
|
$no_lsx = 1;
|
||||||
|
} else {
|
||||||
|
$no_lsx = 0;
|
||||||
|
}
|
||||||
|
unlink("$tmplsx.o");
|
||||||
|
|
||||||
|
$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||||
|
$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
|
||||||
|
$lasx_flags = "-march=loongarch64 -mlasx";
|
||||||
|
print $tmplasx "#include <lasxintrin.h>\n\n";
|
||||||
|
print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
|
||||||
|
|
||||||
|
$args = "$lasx_flags -o $tmplasx.o $tmplasx";
|
||||||
|
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||||
|
system(@cmd) == 0;
|
||||||
|
if ($? != 0) {
|
||||||
|
$no_lasx = 1;
|
||||||
|
} else {
|
||||||
|
$no_lasx = 0;
|
||||||
|
}
|
||||||
|
unlink("$tmplasx.o");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||||
|
@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
|
||||||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||||
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
|
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
|
||||||
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
|
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
|
||||||
|
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
|
||||||
|
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
|
||||||
|
|
||||||
$os =~ tr/[a-z]/[A-Z]/;
|
$os =~ tr/[a-z]/[A-Z]/;
|
||||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||||
|
@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
||||||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
||||||
|
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
|
||||||
|
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
|
||||||
|
|
||||||
|
|
||||||
if ($os eq "LINUX") {
|
if ($os eq "LINUX") {
|
||||||
|
|
2
cblas.h
2
cblas.h
|
@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
|
||||||
void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
|
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||||
|
|
||||||
void cblas_xerbla(blasint p, char *rout, char *form, ...);
|
void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
|
||||||
|
|
||||||
/*** BLAS extensions ***/
|
/*** BLAS extensions ***/
|
||||||
|
|
||||||
|
|
|
@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
|
||||||
if (ARM64)
|
if (ARM64)
|
||||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
|
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
|
||||||
set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
|
||||||
endif ()
|
endif ()
|
||||||
if (DYNAMIC_LIST)
|
if (DYNAMIC_LIST)
|
||||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||||
|
@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||||
endif ()
|
endif ()
|
||||||
if (NOT NO_AVX512)
|
if (NOT NO_AVX512)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
|
||||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
endif ()
|
endif ()
|
||||||
if (DYNAMIC_LIST)
|
if (DYNAMIC_LIST)
|
||||||
|
@ -135,7 +135,7 @@ if (ARM64)
|
||||||
set(BINARY_DEFINED 1)
|
set(BINARY_DEFINED 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${ARCH} STREQUAL "riscv64")
|
if (RISCV64)
|
||||||
set(NO_BINARY_MODE 1)
|
set(NO_BINARY_MODE 1)
|
||||||
set(BINARY_DEFINED 1)
|
set(BINARY_DEFINED 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
|
||||||
|
if (POWER)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
|
||||||
|
else ()
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||||
|
@ -172,6 +180,9 @@ endif ()
|
||||||
|
|
||||||
if (${CORE} STREQUAL NEOVERSEN2)
|
if (${CORE} STREQUAL NEOVERSEN2)
|
||||||
if (NOT DYNAMIC_ARCH)
|
if (NOT DYNAMIC_ARCH)
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||||
|
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||||
|
else ()
|
||||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||||
|
@ -179,16 +190,21 @@ if (${CORE} STREQUAL NEOVERSEN2)
|
||||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||||
endif()
|
endif()
|
||||||
endif ()
|
endif ()
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${CORE} STREQUAL NEOVERSEV1)
|
if (${CORE} STREQUAL NEOVERSEV1)
|
||||||
if (NOT DYNAMIC_ARCH)
|
if (NOT DYNAMIC_ARCH)
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||||
|
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||||
|
else ()
|
||||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||||
else ()
|
else ()
|
||||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||||
endif()
|
endif()
|
||||||
|
endif()
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
@ -205,8 +221,12 @@ endif ()
|
||||||
|
|
||||||
if (${CORE} STREQUAL ARMV8SVE)
|
if (${CORE} STREQUAL ARMV8SVE)
|
||||||
if (NOT DYNAMIC_ARCH)
|
if (NOT DYNAMIC_ARCH)
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||||
|
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
|
||||||
|
else ()
|
||||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||||
endif ()
|
endif ()
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${CORE} STREQUAL CORTEXA510)
|
if (${CORE} STREQUAL CORTEXA510)
|
||||||
|
|
|
@ -3,7 +3,8 @@
|
||||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||||
## Sets Fortran related variables.
|
## Sets Fortran related variables.
|
||||||
|
|
||||||
if (${F_COMPILER} STREQUAL "FLANG")
|
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||||
|
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||||
if (BINARY64 AND INTERFACE64)
|
if (BINARY64 AND INTERFACE64)
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||||
|
@ -38,16 +39,18 @@ if (${F_COMPILER} STREQUAL "G95")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||||
|
if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||||
# ensure reentrancy of lapack codes
|
# ensure reentrancy of lapack codes
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||||
# work around ABI violation in passing string arguments from C
|
# work around ABI violation in passing string arguments from C
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
|
||||||
if (NOT NO_LAPACK)
|
if (NOT NO_LAPACK)
|
||||||
|
# Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||||
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
||||||
endif ()
|
endif ()
|
||||||
|
endif ()
|
||||||
if (NO_BINARY_MODE)
|
if (NO_BINARY_MODE)
|
||||||
if (MIPS64)
|
if (MIPS64)
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
|
@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
if (RISCV64)
|
||||||
|
if (BINARY64)
|
||||||
|
if (INTERFACE64)
|
||||||
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
else ()
|
else ()
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||||
|
@ -121,7 +131,7 @@ if (${F_COMPILER} STREQUAL "IBM")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${F_COMPILER} STREQUAL "PGI")
|
if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
|
||||||
set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
|
set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
|
|
|
@ -124,7 +124,7 @@ set(SLASRC
|
||||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||||
sgesvdq.f slaorhr_col_getrfnp.f
|
sgesvdq.f slaorhr_col_getrfnp.f
|
||||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
|
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
|
||||||
slatrs3.f strsyl3.f sgelst.f)
|
slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90)
|
||||||
|
|
||||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||||
|
@ -187,7 +187,7 @@ set(CLASRC
|
||||||
cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
|
cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
|
||||||
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
|
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
|
||||||
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
|
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
|
||||||
crot.f cspcon.f csprfs.f cspsv.f
|
crot.f crscl.f cspcon.f csprfs.f cspsv.f
|
||||||
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
|
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
|
||||||
cstegr.f cstein.f csteqr.f csycon.f
|
cstegr.f cstein.f csteqr.f csycon.f
|
||||||
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f
|
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f
|
||||||
|
@ -223,7 +223,7 @@ set(CLASRC
|
||||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||||
cungtsqr.f cungtsqr_row.f cunhr_col.f
|
cungtsqr.f cungtsqr_row.f cunhr_col.f
|
||||||
clatrs3.f ctrsyl3.f cgelst.f)
|
clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90)
|
||||||
|
|
||||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||||
|
@ -316,7 +316,7 @@ set(DLASRC
|
||||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
|
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
|
||||||
dlatrs3.f dtrsyl3.f dgelst.f)
|
dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90)
|
||||||
|
|
||||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||||
|
@ -381,7 +381,7 @@ set(ZLASRC
|
||||||
zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
|
zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
|
||||||
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
|
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
|
||||||
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
|
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
|
||||||
zrot.f zspcon.f zsprfs.f zspsv.f
|
zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
|
||||||
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
|
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
|
||||||
zstegr.f zstein.f zsteqr.f zsycon.f
|
zstegr.f zstein.f zsteqr.f zsycon.f
|
||||||
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f
|
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f
|
||||||
|
@ -419,7 +419,7 @@ set(ZLASRC
|
||||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||||
zungtsqr.f zungtsqr_row.f zunhr_col.f
|
zungtsqr.f zungtsqr_row.f zunhr_col.f
|
||||||
zlatrs3.f ztrsyl3.f zgelst.f)
|
zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90)
|
||||||
|
|
||||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||||
|
@ -436,6 +436,7 @@ if(USE_XBLAS)
|
||||||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(BUILD_LAPACK_DEPRECATED)
|
||||||
list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
|
list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
|
||||||
DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
|
DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
|
||||||
DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
|
DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
|
||||||
|
@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
|
||||||
DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
|
DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
|
||||||
DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
|
DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
|
||||||
message(STATUS "Building deprecated routines")
|
message(STATUS "Building deprecated routines")
|
||||||
|
endif()
|
||||||
|
|
||||||
set(DSLASRC spotrs.f)
|
set(DSLASRC spotrs.f)
|
||||||
|
|
||||||
|
@ -622,7 +624,7 @@ set(SLASRC
|
||||||
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
|
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
|
||||||
sgesvdq.c slaorhr_col_getrfnp.c
|
sgesvdq.c slaorhr_col_getrfnp.c
|
||||||
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
|
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
|
||||||
slatrs3.c strsyl3.c sgelst.c)
|
slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c)
|
||||||
|
|
||||||
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
|
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
|
||||||
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
|
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
|
||||||
|
@ -684,7 +686,7 @@ set(CLASRC
|
||||||
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
|
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
|
||||||
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
|
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
|
||||||
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
|
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
|
||||||
crot.c cspcon.c csprfs.c cspsv.c
|
crot.c crscl.c cspcon.c csprfs.c cspsv.c
|
||||||
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
|
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
|
||||||
cstegr.c cstein.c csteqr.c csycon.c
|
cstegr.c cstein.c csteqr.c csycon.c
|
||||||
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
|
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
|
||||||
|
@ -720,7 +722,7 @@ set(CLASRC
|
||||||
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
|
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
|
||||||
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
|
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
|
||||||
cungtsqr.c cungtsqr_row.c cunhr_col.c
|
cungtsqr.c cungtsqr_row.c cunhr_col.c
|
||||||
clatrs3.c ctrsyl3.c cgelst.c)
|
clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c)
|
||||||
|
|
||||||
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
|
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
|
||||||
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
|
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
|
||||||
|
@ -812,7 +814,7 @@ set(DLASRC
|
||||||
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
|
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
|
||||||
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
|
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
|
||||||
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
|
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
|
||||||
dlatrs3.c dtrsyl3.c dgelst.c)
|
dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c)
|
||||||
|
|
||||||
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
|
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
|
||||||
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
|
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
|
||||||
|
@ -876,7 +878,7 @@ set(ZLASRC
|
||||||
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
|
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
|
||||||
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
|
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
|
||||||
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
|
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
|
||||||
zrot.c zspcon.c zsprfs.c zspsv.c
|
zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c
|
||||||
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
|
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
|
||||||
zstegr.c zstein.c zsteqr.c zsycon.c
|
zstegr.c zstein.c zsteqr.c zsycon.c
|
||||||
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
|
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
|
||||||
|
@ -913,7 +915,8 @@ set(ZLASRC
|
||||||
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
|
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
|
||||||
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
|
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
|
||||||
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
|
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
|
||||||
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
|
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c
|
||||||
|
zgedmd.c zgedmdq.c)
|
||||||
|
|
||||||
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
|
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
|
||||||
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
|
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
|
||||||
|
@ -930,6 +933,7 @@ if(USE_XBLAS)
|
||||||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if(BUILD_LAPACK_DEPRECATED)
|
||||||
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
|
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
|
||||||
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
|
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
|
||||||
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
|
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
|
||||||
|
@ -943,6 +947,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
|
||||||
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
|
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
|
||||||
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
|
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
|
||||||
message(STATUS "Building deprecated routines")
|
message(STATUS "Building deprecated routines")
|
||||||
|
endif()
|
||||||
|
|
||||||
set(DSLASRC spotrs.c)
|
set(DSLASRC spotrs.c)
|
||||||
|
|
||||||
|
|
|
@ -70,8 +70,6 @@ set(CSRC
|
||||||
lapacke_cgeqlf_work.c
|
lapacke_cgeqlf_work.c
|
||||||
lapacke_cgeqp3.c
|
lapacke_cgeqp3.c
|
||||||
lapacke_cgeqp3_work.c
|
lapacke_cgeqp3_work.c
|
||||||
lapacke_cgeqpf.c
|
|
||||||
lapacke_cgeqpf_work.c
|
|
||||||
lapacke_cgeqr.c
|
lapacke_cgeqr.c
|
||||||
lapacke_cgeqr_work.c
|
lapacke_cgeqr_work.c
|
||||||
lapacke_cgeqr2.c
|
lapacke_cgeqr2.c
|
||||||
|
@ -92,6 +90,10 @@ set(CSRC
|
||||||
lapacke_cgerqf_work.c
|
lapacke_cgerqf_work.c
|
||||||
lapacke_cgesdd.c
|
lapacke_cgesdd.c
|
||||||
lapacke_cgesdd_work.c
|
lapacke_cgesdd_work.c
|
||||||
|
lapacke_cgedmd.c
|
||||||
|
lapacke_cgedmd_work.c
|
||||||
|
lapacke_cgedmdq.c
|
||||||
|
lapacke_cgedmdq_work.c
|
||||||
lapacke_cgesv.c
|
lapacke_cgesv.c
|
||||||
lapacke_cgesv_work.c
|
lapacke_cgesv_work.c
|
||||||
lapacke_cgesvd.c
|
lapacke_cgesvd.c
|
||||||
|
@ -144,12 +146,8 @@ set(CSRC
|
||||||
lapacke_cggqrf_work.c
|
lapacke_cggqrf_work.c
|
||||||
lapacke_cggrqf.c
|
lapacke_cggrqf.c
|
||||||
lapacke_cggrqf_work.c
|
lapacke_cggrqf_work.c
|
||||||
lapacke_cggsvd.c
|
|
||||||
lapacke_cggsvd_work.c
|
|
||||||
lapacke_cggsvd3.c
|
lapacke_cggsvd3.c
|
||||||
lapacke_cggsvd3_work.c
|
lapacke_cggsvd3_work.c
|
||||||
lapacke_cggsvp.c
|
|
||||||
lapacke_cggsvp_work.c
|
|
||||||
lapacke_cggsvp3.c
|
lapacke_cggsvp3.c
|
||||||
lapacke_cggsvp3_work.c
|
lapacke_cggsvp3_work.c
|
||||||
lapacke_cgtcon.c
|
lapacke_cgtcon.c
|
||||||
|
@ -564,6 +562,8 @@ set(CSRC
|
||||||
lapacke_ctrsna_work.c
|
lapacke_ctrsna_work.c
|
||||||
lapacke_ctrsyl.c
|
lapacke_ctrsyl.c
|
||||||
lapacke_ctrsyl_work.c
|
lapacke_ctrsyl_work.c
|
||||||
|
lapacke_ctrsyl3.c
|
||||||
|
lapacke_ctrsyl3_work.c
|
||||||
lapacke_ctrtri.c
|
lapacke_ctrtri.c
|
||||||
lapacke_ctrtri_work.c
|
lapacke_ctrtri_work.c
|
||||||
lapacke_ctrtrs.c
|
lapacke_ctrtrs.c
|
||||||
|
@ -596,6 +596,8 @@ set(CSRC
|
||||||
lapacke_cungtr_work.c
|
lapacke_cungtr_work.c
|
||||||
lapacke_cungtsqr_row.c
|
lapacke_cungtsqr_row.c
|
||||||
lapacke_cungtsqr_row_work.c
|
lapacke_cungtsqr_row_work.c
|
||||||
|
lapacke_cunhr_col.c
|
||||||
|
lapacke_cunhr_col_work.c
|
||||||
lapacke_cunmbr.c
|
lapacke_cunmbr.c
|
||||||
lapacke_cunmbr_work.c
|
lapacke_cunmbr_work.c
|
||||||
lapacke_cunmhr.c
|
lapacke_cunmhr.c
|
||||||
|
@ -695,8 +697,6 @@ set(DSRC
|
||||||
lapacke_dgeqlf_work.c
|
lapacke_dgeqlf_work.c
|
||||||
lapacke_dgeqp3.c
|
lapacke_dgeqp3.c
|
||||||
lapacke_dgeqp3_work.c
|
lapacke_dgeqp3_work.c
|
||||||
lapacke_dgeqpf.c
|
|
||||||
lapacke_dgeqpf_work.c
|
|
||||||
lapacke_dgeqr.c
|
lapacke_dgeqr.c
|
||||||
lapacke_dgeqr_work.c
|
lapacke_dgeqr_work.c
|
||||||
lapacke_dgeqr2.c
|
lapacke_dgeqr2.c
|
||||||
|
@ -717,6 +717,10 @@ set(DSRC
|
||||||
lapacke_dgerqf_work.c
|
lapacke_dgerqf_work.c
|
||||||
lapacke_dgesdd.c
|
lapacke_dgesdd.c
|
||||||
lapacke_dgesdd_work.c
|
lapacke_dgesdd_work.c
|
||||||
|
lapacke_dgedmd.c
|
||||||
|
lapacke_dgedmd_work.c
|
||||||
|
lapacke_dgedmdq.c
|
||||||
|
lapacke_dgedmdq_work.c
|
||||||
lapacke_dgesv.c
|
lapacke_dgesv.c
|
||||||
lapacke_dgesv_work.c
|
lapacke_dgesv_work.c
|
||||||
lapacke_dgesvd.c
|
lapacke_dgesvd.c
|
||||||
|
@ -771,12 +775,8 @@ set(DSRC
|
||||||
lapacke_dggqrf_work.c
|
lapacke_dggqrf_work.c
|
||||||
lapacke_dggrqf.c
|
lapacke_dggrqf.c
|
||||||
lapacke_dggrqf_work.c
|
lapacke_dggrqf_work.c
|
||||||
lapacke_dggsvd.c
|
|
||||||
lapacke_dggsvd_work.c
|
|
||||||
lapacke_dggsvd3.c
|
lapacke_dggsvd3.c
|
||||||
lapacke_dggsvd3_work.c
|
lapacke_dggsvd3_work.c
|
||||||
lapacke_dggsvp.c
|
|
||||||
lapacke_dggsvp_work.c
|
|
||||||
lapacke_dggsvp3.c
|
lapacke_dggsvp3.c
|
||||||
lapacke_dggsvp3_work.c
|
lapacke_dggsvp3_work.c
|
||||||
lapacke_dgtcon.c
|
lapacke_dgtcon.c
|
||||||
|
@ -874,6 +874,8 @@ set(DSRC
|
||||||
lapacke_dorgtr_work.c
|
lapacke_dorgtr_work.c
|
||||||
lapacke_dorgtsqr_row.c
|
lapacke_dorgtsqr_row.c
|
||||||
lapacke_dorgtsqr_row_work.c
|
lapacke_dorgtsqr_row_work.c
|
||||||
|
lapacke_dorhr_col.c
|
||||||
|
lapacke_dorhr_col_work.c
|
||||||
lapacke_dormbr.c
|
lapacke_dormbr.c
|
||||||
lapacke_dormbr_work.c
|
lapacke_dormbr_work.c
|
||||||
lapacke_dormhr.c
|
lapacke_dormhr.c
|
||||||
|
@ -1186,6 +1188,8 @@ set(DSRC
|
||||||
lapacke_dtrsna_work.c
|
lapacke_dtrsna_work.c
|
||||||
lapacke_dtrsyl.c
|
lapacke_dtrsyl.c
|
||||||
lapacke_dtrsyl_work.c
|
lapacke_dtrsyl_work.c
|
||||||
|
lapacke_dtrsyl3.c
|
||||||
|
lapacke_dtrsyl3_work.c
|
||||||
lapacke_dtrtri.c
|
lapacke_dtrtri.c
|
||||||
lapacke_dtrtri_work.c
|
lapacke_dtrtri_work.c
|
||||||
lapacke_dtrtrs.c
|
lapacke_dtrtrs.c
|
||||||
|
@ -1275,8 +1279,6 @@ set(SSRC
|
||||||
lapacke_sgeqlf_work.c
|
lapacke_sgeqlf_work.c
|
||||||
lapacke_sgeqp3.c
|
lapacke_sgeqp3.c
|
||||||
lapacke_sgeqp3_work.c
|
lapacke_sgeqp3_work.c
|
||||||
lapacke_sgeqpf.c
|
|
||||||
lapacke_sgeqpf_work.c
|
|
||||||
lapacke_sgeqr.c
|
lapacke_sgeqr.c
|
||||||
lapacke_sgeqr_work.c
|
lapacke_sgeqr_work.c
|
||||||
lapacke_sgeqr2.c
|
lapacke_sgeqr2.c
|
||||||
|
@ -1297,6 +1299,10 @@ set(SSRC
|
||||||
lapacke_sgerqf_work.c
|
lapacke_sgerqf_work.c
|
||||||
lapacke_sgesdd.c
|
lapacke_sgesdd.c
|
||||||
lapacke_sgesdd_work.c
|
lapacke_sgesdd_work.c
|
||||||
|
lapacke_sgedmd.c
|
||||||
|
lapacke_sgedmd_work.c
|
||||||
|
lapacke_sgedmdq.c
|
||||||
|
lapacke_sgedmdq_work.c
|
||||||
lapacke_sgesv.c
|
lapacke_sgesv.c
|
||||||
lapacke_sgesv_work.c
|
lapacke_sgesv_work.c
|
||||||
lapacke_sgesvd.c
|
lapacke_sgesvd.c
|
||||||
|
@ -1351,12 +1357,8 @@ set(SSRC
|
||||||
lapacke_sggqrf_work.c
|
lapacke_sggqrf_work.c
|
||||||
lapacke_sggrqf.c
|
lapacke_sggrqf.c
|
||||||
lapacke_sggrqf_work.c
|
lapacke_sggrqf_work.c
|
||||||
lapacke_sggsvd.c
|
|
||||||
lapacke_sggsvd_work.c
|
|
||||||
lapacke_sggsvd3.c
|
lapacke_sggsvd3.c
|
||||||
lapacke_sggsvd3_work.c
|
lapacke_sggsvd3_work.c
|
||||||
lapacke_sggsvp.c
|
|
||||||
lapacke_sggsvp_work.c
|
|
||||||
lapacke_sggsvp3.c
|
lapacke_sggsvp3.c
|
||||||
lapacke_sggsvp3_work.c
|
lapacke_sggsvp3_work.c
|
||||||
lapacke_sgtcon.c
|
lapacke_sgtcon.c
|
||||||
|
@ -1453,6 +1455,8 @@ set(SSRC
|
||||||
lapacke_sorgtr_work.c
|
lapacke_sorgtr_work.c
|
||||||
lapacke_sorgtsqr_row.c
|
lapacke_sorgtsqr_row.c
|
||||||
lapacke_sorgtsqr_row_work.c
|
lapacke_sorgtsqr_row_work.c
|
||||||
|
lapacke_sorhr_col.c
|
||||||
|
lapacke_sorhr_col_work.c
|
||||||
lapacke_sormbr.c
|
lapacke_sormbr.c
|
||||||
lapacke_sormbr_work.c
|
lapacke_sormbr_work.c
|
||||||
lapacke_sormhr.c
|
lapacke_sormhr.c
|
||||||
|
@ -1762,6 +1766,8 @@ set(SSRC
|
||||||
lapacke_strsna_work.c
|
lapacke_strsna_work.c
|
||||||
lapacke_strsyl.c
|
lapacke_strsyl.c
|
||||||
lapacke_strsyl_work.c
|
lapacke_strsyl_work.c
|
||||||
|
lapacke_strsyl3.c
|
||||||
|
lapacke_strsyl3_work.c
|
||||||
lapacke_strtri.c
|
lapacke_strtri.c
|
||||||
lapacke_strtri_work.c
|
lapacke_strtri_work.c
|
||||||
lapacke_strtrs.c
|
lapacke_strtrs.c
|
||||||
|
@ -1849,8 +1855,6 @@ set(ZSRC
|
||||||
lapacke_zgeqlf_work.c
|
lapacke_zgeqlf_work.c
|
||||||
lapacke_zgeqp3.c
|
lapacke_zgeqp3.c
|
||||||
lapacke_zgeqp3_work.c
|
lapacke_zgeqp3_work.c
|
||||||
lapacke_zgeqpf.c
|
|
||||||
lapacke_zgeqpf_work.c
|
|
||||||
lapacke_zgeqr.c
|
lapacke_zgeqr.c
|
||||||
lapacke_zgeqr_work.c
|
lapacke_zgeqr_work.c
|
||||||
lapacke_zgeqr2.c
|
lapacke_zgeqr2.c
|
||||||
|
@ -1871,6 +1875,10 @@ set(ZSRC
|
||||||
lapacke_zgerqf_work.c
|
lapacke_zgerqf_work.c
|
||||||
lapacke_zgesdd.c
|
lapacke_zgesdd.c
|
||||||
lapacke_zgesdd_work.c
|
lapacke_zgesdd_work.c
|
||||||
|
lapacke_zgedmd.c
|
||||||
|
lapacke_zgedmd_work.c
|
||||||
|
lapacke_zgedmdq.c
|
||||||
|
lapacke_zgedmdq_work.c
|
||||||
lapacke_zgesv.c
|
lapacke_zgesv.c
|
||||||
lapacke_zgesv_work.c
|
lapacke_zgesv_work.c
|
||||||
lapacke_zgesvd.c
|
lapacke_zgesvd.c
|
||||||
|
@ -1925,12 +1933,8 @@ set(ZSRC
|
||||||
lapacke_zggqrf_work.c
|
lapacke_zggqrf_work.c
|
||||||
lapacke_zggrqf.c
|
lapacke_zggrqf.c
|
||||||
lapacke_zggrqf_work.c
|
lapacke_zggrqf_work.c
|
||||||
lapacke_zggsvd.c
|
|
||||||
lapacke_zggsvd_work.c
|
|
||||||
lapacke_zggsvd3.c
|
lapacke_zggsvd3.c
|
||||||
lapacke_zggsvd3_work.c
|
lapacke_zggsvd3_work.c
|
||||||
lapacke_zggsvp.c
|
|
||||||
lapacke_zggsvp_work.c
|
|
||||||
lapacke_zggsvp3.c
|
lapacke_zggsvp3.c
|
||||||
lapacke_zggsvp3_work.c
|
lapacke_zggsvp3_work.c
|
||||||
lapacke_zgtcon.c
|
lapacke_zgtcon.c
|
||||||
|
@ -2343,6 +2347,8 @@ set(ZSRC
|
||||||
lapacke_ztrsna_work.c
|
lapacke_ztrsna_work.c
|
||||||
lapacke_ztrsyl.c
|
lapacke_ztrsyl.c
|
||||||
lapacke_ztrsyl_work.c
|
lapacke_ztrsyl_work.c
|
||||||
|
lapacke_ztrsyl3.c
|
||||||
|
lapacke_ztrsyl3_work.c
|
||||||
lapacke_ztrtri.c
|
lapacke_ztrtri.c
|
||||||
lapacke_ztrtri_work.c
|
lapacke_ztrtri_work.c
|
||||||
lapacke_ztrtrs.c
|
lapacke_ztrtrs.c
|
||||||
|
@ -2375,6 +2381,8 @@ set(ZSRC
|
||||||
lapacke_zungtr_work.c
|
lapacke_zungtr_work.c
|
||||||
lapacke_zungtsqr_row.c
|
lapacke_zungtsqr_row.c
|
||||||
lapacke_zungtsqr_row_work.c
|
lapacke_zungtsqr_row_work.c
|
||||||
|
lapacke_zunhr_col.c
|
||||||
|
lapacke_zunhr_col_work.c
|
||||||
lapacke_zunmbr.c
|
lapacke_zunmbr.c
|
||||||
lapacke_zunmbr_work.c
|
lapacke_zunmbr_work.c
|
||||||
lapacke_zunmhr.c
|
lapacke_zunmhr.c
|
||||||
|
@ -2401,6 +2409,12 @@ set(ZSRC
|
||||||
lapacke_csyr_work.c
|
lapacke_csyr_work.c
|
||||||
lapacke_ilaver.c
|
lapacke_ilaver.c
|
||||||
)
|
)
|
||||||
|
# Deprecated LAPACKE wrappers are only built on request; append them to the per-precision source lists defined above.
if (BUILD_LAPACK_DEPRECATED)
|
||||||
|
list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
|
||||||
|
list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
|
||||||
|
list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
|
||||||
|
list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
|
||||||
|
endif()
|
||||||
|
|
||||||
set(SRCX
|
set(SRCX
|
||||||
lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c
|
lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c
|
||||||
|
|
|
@ -55,7 +55,7 @@ if (DEFINED TARGET)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||||
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
|
||||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
@ -280,7 +280,41 @@ if (DEFINED TARGET)
|
||||||
if (${TARGET} STREQUAL POWER8)
|
if (${TARGET} STREQUAL POWER8)
|
||||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (${TARGET} STREQUAL NEOVERSEV1)
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||||
|
else ()
|
||||||
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||||
|
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||||
|
else ()
|
||||||
|
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
if (${TARGET} STREQUAL NEOVERSEN2)
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||||
|
else ()
|
||||||
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||||
|
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||||
|
else ()
|
||||||
|
# The version check above requires GCC 10.4 or newer for Neoverse N2; report the offending compiler and its version.
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
if (${TARGET} STREQUAL ARMV8SVE)
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve")
|
||||||
|
else ()
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (DEFINED BINARY)
|
if (DEFINED BINARY)
|
||||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||||
set(MIPS64 1)
|
set(MIPS64 1)
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
||||||
set(LOONGARCH64 1)
|
set(LOONGARCH64 1)
|
||||||
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
|
||||||
|
set(RISCV64 1)
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||||
if (NOT BINARY)
|
if (NOT BINARY)
|
||||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||||
|
@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||||
endif()
|
endif()
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||||
set(X86 1)
|
set(X86 1)
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
|
||||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||||
set(ARM64 1)
|
set(ARM64 1)
|
||||||
else()
|
else()
|
||||||
|
@ -107,7 +109,7 @@ else()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NOT BINARY)
|
if (NOT BINARY)
|
||||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
|
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
|
||||||
set(BINARY 64)
|
set(BINARY 64)
|
||||||
else ()
|
else ()
|
||||||
set(BINARY 32)
|
set(BINARY 32)
|
||||||
|
|
|
@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
||||||
#message(STATUS "skipping ${makefile_line}")
|
#message(STATUS "skipping ${makefile_line}")
|
||||||
continue ()
|
continue ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# Example 1: SBGEMM_SMALL_M_PERMIT =
|
||||||
|
# Unset the variable
|
||||||
|
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
|
||||||
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
|
set(var_name ${CMAKE_MATCH_1})
|
||||||
|
unset(${var_name})
|
||||||
|
endif()
|
||||||
|
|
||||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||||
if (NOT "${line_match}" STREQUAL "")
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
#message(STATUS "match on ${line_match}")
|
#message(STATUS "match on ${line_match}")
|
||||||
|
|
2
common.h
2
common.h
|
@ -525,7 +525,7 @@ static inline unsigned long long rpcc(void){
|
||||||
#endif // !RPCC_DEFINED
|
#endif // !RPCC_DEFINED
|
||||||
|
|
||||||
#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
|
#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static __inline void blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
do {
|
do {
|
||||||
while (*address) {YIELDING;};
|
while (*address) {YIELDING;};
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
#define WMB asm("wmb")
|
#define WMB asm("wmb")
|
||||||
#define RMB asm("mb")
|
#define RMB asm("mb")
|
||||||
|
|
||||||
static void __inline blas_lock(unsigned long *address){
|
static __inline void blas_lock(unsigned long *address){
|
||||||
#ifndef __DECC
|
#ifndef __DECC
|
||||||
unsigned long tmp1, tmp2;
|
unsigned long tmp1, tmp2;
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
|
|
@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
|
#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
|
||||||
|
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static __inline void blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
int register ret;
|
int register ret;
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#ifndef ASSEMBLER
|
#ifndef ASSEMBLER
|
||||||
|
|
||||||
|
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static __inline void blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
BLASULONG ret;
|
BLASULONG ret;
|
||||||
|
|
||||||
|
|
|
@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
return x / y;
|
return x / y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef NO_AFFINITY
|
||||||
|
/* Executes the `rdtimel.w` instruction and returns the value it writes to its
 * [id] operand; the value written to the [counter] operand is discarded.
 * NOTE(review): presumably the id identifies the executing core for the
 * affinity code -- confirm against the callers. */
static inline int WhereAmI(void){
|
||||||
|
int ret = 0, counter = 0;
|
||||||
|
__asm__ volatile (
|
||||||
|
"rdtimel.w %[counter], %[id]"
|
||||||
|
: [id]"=r"(ret), [counter]"=r"(counter)
|
||||||
|
:
|
||||||
|
: "memory"
|
||||||
|
);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project. */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -45,12 +46,14 @@
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int dtb_entries;
|
int dtb_entries;
|
||||||
|
int switch_ratio;
|
||||||
int offsetA, offsetB, align;
|
int offsetA, offsetB, align;
|
||||||
|
|
||||||
#if BUILD_BFLOAT16 == 1
|
#if BUILD_BFLOAT16 == 1
|
||||||
int sbgemm_p, sbgemm_q, sbgemm_r;
|
int sbgemm_p, sbgemm_q, sbgemm_r;
|
||||||
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
||||||
int sbgemm_align_k;
|
int sbgemm_align_k;
|
||||||
|
int need_amxtile_permission; // 0 default, 1 for device support amx.
|
||||||
|
|
||||||
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||||
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||||
|
|
|
@ -91,7 +91,7 @@
|
||||||
|
|
||||||
void *qalloc(int flags, size_t bytes);
|
void *qalloc(int flags, size_t bytes);
|
||||||
|
|
||||||
static void INLINE blas_lock(volatile unsigned long *address){
|
static INLINE void blas_lock(volatile unsigned long *address){
|
||||||
|
|
||||||
long int ret, val = 1;
|
long int ret, val = 1;
|
||||||
|
|
||||||
|
|
|
@ -45,7 +45,7 @@
|
||||||
|
|
||||||
#ifndef ASSEMBLER
|
#ifndef ASSEMBLER
|
||||||
|
|
||||||
static void __inline blas_lock(volatile unsigned long *address){
|
static __inline void blas_lock(volatile unsigned long *address){
|
||||||
|
|
||||||
long int ret = 1;
|
long int ret = 1;
|
||||||
|
|
||||||
|
|
|
@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
|
||||||
/* Global Parameter */
|
/* Global Parameter */
|
||||||
extern int blas_cpu_number;
|
extern int blas_cpu_number;
|
||||||
extern int blas_num_threads;
|
extern int blas_num_threads;
|
||||||
extern int blas_num_threads_set;
|
|
||||||
extern int blas_omp_linked;
|
extern int blas_omp_linked;
|
||||||
|
|
||||||
#define BLAS_LEGACY 0x8000U
|
#define BLAS_LEGACY 0x8000U
|
||||||
|
@ -136,15 +135,13 @@ typedef struct blas_queue {
|
||||||
#ifdef SMP_SERVER
|
#ifdef SMP_SERVER
|
||||||
|
|
||||||
extern int blas_server_avail;
|
extern int blas_server_avail;
|
||||||
|
extern int blas_omp_number_max;
|
||||||
|
|
||||||
static __inline int num_cpu_avail(int level) {
|
static __inline int num_cpu_avail(int level) {
|
||||||
|
|
||||||
#ifdef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
int openmp_nthreads;
|
int openmp_nthreads;
|
||||||
if (blas_num_threads_set == 0)
|
|
||||||
openmp_nthreads=omp_get_max_threads();
|
openmp_nthreads=omp_get_max_threads();
|
||||||
else
|
|
||||||
openmp_nthreads=blas_cpu_number;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef USE_OPENMP
|
#ifndef USE_OPENMP
|
||||||
|
@ -156,6 +153,12 @@ int openmp_nthreads;
|
||||||
) return 1;
|
) return 1;
|
||||||
|
|
||||||
#ifdef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
|
if (openmp_nthreads > blas_omp_number_max){
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
|
||||||
|
#endif
|
||||||
|
openmp_nthreads = blas_omp_number_max;
|
||||||
|
}
|
||||||
if (blas_cpu_number != openmp_nthreads) {
|
if (blas_cpu_number != openmp_nthreads) {
|
||||||
goto_set_num_threads(openmp_nthreads);
|
goto_set_num_threads(openmp_nthreads);
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,7 +54,7 @@
|
||||||
#define __volatile__
|
#define __volatile__
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static __inline void blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
|
|
|
@ -70,7 +70,7 @@
|
||||||
#define RMB
|
#define RMB
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static __inline void blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
|
|
||||||
#ifndef C_MSVC
|
#ifndef C_MSVC
|
||||||
|
|
|
@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#ifndef ASSEMBLER
|
#ifndef ASSEMBLER
|
||||||
|
|
||||||
/*
|
/*
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static __inline void blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
BLASULONG ret;
|
BLASULONG ret;
|
||||||
|
|
||||||
|
|
|
@ -267,8 +267,9 @@ int detect(void)
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
|
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
|
||||||
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
|
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
|
||||||
|
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
|
||||||
#endif
|
#endif
|
||||||
return CPU_ARMV8;
|
return CPU_ARMV8;
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
**********************************************************************************/
|
**********************************************************************************/
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <sys/auxv.h>
|
||||||
|
|
||||||
/* If LASX extension instructions supported,
|
/* If LASX extension instructions supported,
|
||||||
* using core LOONGSON3R5
|
* using core LOONGSON3R5
|
||||||
|
@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CPU_LOONGSON3R5 1
|
#define CPU_LOONGSON3R5 1
|
||||||
#define CPU_LOONGSON2K1000 2
|
#define CPU_LOONGSON2K1000 2
|
||||||
|
|
||||||
#define LOONGARCH_CFG2 0x02
|
#define LA_HWCAP_LSX (1<<4)
|
||||||
#define LOONGARCH_LASX 1<<7
|
#define LA_HWCAP_LASX (1<<5)
|
||||||
#define LOONGARCH_LSX 1<<6
|
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"LOONGSONGENERIC",
|
"LOONGSONGENERIC",
|
||||||
|
@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
|
||||||
|
|
||||||
int detect(void) {
|
int detect(void) {
|
||||||
#ifdef __linux
|
#ifdef __linux
|
||||||
uint32_t reg = 0;
|
int flag = (int)getauxval(AT_HWCAP);
|
||||||
|
|
||||||
__asm__ volatile (
|
if (flag & LA_HWCAP_LASX)
|
||||||
"cpucfg %0, %1 \n\t"
|
|
||||||
: "+&r"(reg)
|
|
||||||
: "r"(LOONGARCH_CFG2)
|
|
||||||
);
|
|
||||||
|
|
||||||
if (reg & LOONGARCH_LASX)
|
|
||||||
return CPU_LOONGSON3R5;
|
return CPU_LOONGSON3R5;
|
||||||
else if (reg & LOONGARCH_LSX)
|
else if (flag & LA_HWCAP_LSX)
|
||||||
return CPU_LOONGSON2K1000;
|
return CPU_LOONGSON2K1000;
|
||||||
else
|
else
|
||||||
return CPU_GENERIC;
|
return CPU_GENERIC;
|
||||||
|
|
28
cpuid_x86.c
28
cpuid_x86.c
|
@ -1479,6 +1479,8 @@ int get_cpuname(void){
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 15: // Sapphire Rapids
|
case 15: // Sapphire Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return CPUTYPE_SAPPHIRERAPIDS;
|
||||||
if(support_avx512_bf16())
|
if(support_avx512_bf16())
|
||||||
return CPUTYPE_COOPERLAKE;
|
return CPUTYPE_COOPERLAKE;
|
||||||
if(support_avx512())
|
if(support_avx512())
|
||||||
|
@ -1549,6 +1551,7 @@ int get_cpuname(void){
|
||||||
case 7: // Raptor Lake
|
case 7: // Raptor Lake
|
||||||
case 10:
|
case 10:
|
||||||
case 15:
|
case 15:
|
||||||
|
case 14: // Alder Lake N
|
||||||
if(support_avx2())
|
if(support_avx2())
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
|
@ -1845,7 +1848,8 @@ static char *cpuname[] = {
|
||||||
"ZEN",
|
"ZEN",
|
||||||
"SKYLAKEX",
|
"SKYLAKEX",
|
||||||
"DHYANA",
|
"DHYANA",
|
||||||
"COOPERLAKE"
|
"COOPERLAKE",
|
||||||
|
"SAPPHIRERAPIDS",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1902,7 +1906,8 @@ static char *lowercpuname[] = {
|
||||||
"zen",
|
"zen",
|
||||||
"skylakex",
|
"skylakex",
|
||||||
"dhyana",
|
"dhyana",
|
||||||
"cooperlake"
|
"cooperlake",
|
||||||
|
"sapphirerapids",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
@ -1936,7 +1941,8 @@ static char *corename[] = {
|
||||||
"ZEN",
|
"ZEN",
|
||||||
"SKYLAKEX",
|
"SKYLAKEX",
|
||||||
"DHYANA",
|
"DHYANA",
|
||||||
"COOPERLAKE"
|
"COOPERLAKE",
|
||||||
|
"SAPPHIRERAPIDS",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1970,7 +1976,8 @@ static char *corename_lower[] = {
|
||||||
"zen",
|
"zen",
|
||||||
"skylakex",
|
"skylakex",
|
||||||
"dhyana",
|
"dhyana",
|
||||||
"cooperlake"
|
"cooperlake",
|
||||||
|
"sapphirerapids",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -2276,16 +2283,18 @@ int get_coretype(void){
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
}
|
}
|
||||||
if (model == 15) { // Sapphire Rapids
|
if (model == 15) { // Sapphire Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return CORE_SAPPHIRERAPIDS;
|
||||||
if(support_avx512_bf16())
|
if(support_avx512_bf16())
|
||||||
return CPUTYPE_COOPERLAKE;
|
return CORE_COOPERLAKE;
|
||||||
if(support_avx512())
|
if(support_avx512())
|
||||||
return CPUTYPE_SKYLAKEX;
|
return CORE_SKYLAKEX;
|
||||||
if(support_avx2())
|
if(support_avx2())
|
||||||
return CPUTYPE_HASWELL;
|
return CORE_HASWELL;
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CORE_SANDYBRIDGE;
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
@ -2352,6 +2361,7 @@ int get_coretype(void){
|
||||||
case 7: // Raptor Lake
|
case 7: // Raptor Lake
|
||||||
case 10:
|
case 10:
|
||||||
case 15:
|
case 15:
|
||||||
|
case 14: // Alder Lake N
|
||||||
#ifndef NO_AVX2
|
#ifndef NO_AVX2
|
||||||
if(support_avx2())
|
if(support_avx2())
|
||||||
return CORE_HASWELL;
|
return CORE_HASWELL;
|
||||||
|
|
|
@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
ifeq ($(C_COMPILER), CLANG)
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
CEXTRALIB = -lomp
|
CEXTRALIB += -lomp
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
ifeq ($(F_COMPILER), NAG)
|
ifeq ($(F_COMPILER), NAG)
|
||||||
|
|
|
@ -0,0 +1,270 @@
|
||||||
|
# Guidance for redistributing OpenBLAS
|
||||||
|
|
||||||
|
*We note that this document contains recommendations only - packagers and other
|
||||||
|
redistributors are in charge of how OpenBLAS is built and distributed in their
|
||||||
|
systems, and may have good reasons to deviate from the guidance given on this
|
||||||
|
page. These recommendations are aimed at general packaging systems, with a user
|
||||||
|
base that typically is large, open source (or freely available at least), and
|
||||||
|
doesn't behave uniformly or that the packager is directly connected with.*
|
||||||
|
|
||||||
|
OpenBLAS has a large number of build-time options which can be used to change
|
||||||
|
how it behaves at runtime, how artifacts or symbols are named, etc. Variation
|
||||||
|
in build configuration can be necessary to achieve a given end goal within a
|
||||||
|
distribution or as an end user. However, such variation can also make it more
|
||||||
|
difficult to build on top of OpenBLAS and ship code or other packages in a way
|
||||||
|
that works across many different distros. Here we provide guidance about the
|
||||||
|
most important build options, what effects they may have when changed, and
|
||||||
|
which ones to default to.
|
||||||
|
|
||||||
|
The Make and CMake build systems provide equivalent options and yield more or
|
||||||
|
less the same artifacts, but not exactly (the CMake builds are still
|
||||||
|
experimental). You can choose either one and the options will function in the
|
||||||
|
same way, however the CMake outputs may require some renaming. To review
|
||||||
|
available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
|
||||||
|
the repository.
|
||||||
|
|
||||||
|
Build options typically fall into two categories: (a) options that affect the
|
||||||
|
user interface, such as library and symbol names or APIs that are made
|
||||||
|
available, and (b) options that affect performance and runtime behavior, such
|
||||||
|
as threading behavior or CPU architecture-specific code paths. The user
|
||||||
|
interface options are more important to keep aligned between distributions,
|
||||||
|
while for the performance-related options there are typically more reasons to
|
||||||
|
make choices that deviate from the defaults.
|
||||||
|
|
||||||
|
Here are recommendations for user interface related packaging choices where it
|
||||||
|
is not likely to be a good idea to deviate (typically these are the default
|
||||||
|
settings):
|
||||||
|
|
||||||
|
1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
|
||||||
|
binary size much, so don't turn it off.
|
||||||
|
2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
|
||||||
|
while it does make up a significant part of the binary size of the installed
|
||||||
|
library, that does not outweigh the regression in usability when deviating
|
||||||
|
from the default here.[^1]
|
||||||
|
3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
|
||||||
|
detection files. These files are used by build systems when users want to
|
||||||
|
link against OpenBLAS, and there is no benefit of leaving them out.
|
||||||
|
4. Provide the LP64 interface by default, and if in addition to that you choose
|
||||||
|
to provide an ILP64 interface build as well, use a symbol suffix to avoid
|
||||||
|
symbol name clashes (see the next section).
|
||||||
|
|
||||||
|
[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
|
||||||
|
know. Older versions of Arch Linux did not, and that was known to cause
|
||||||
|
problems.
|
||||||
|
|
||||||
|
|
||||||
|
## ILP64 interface builds
|
||||||
|
|
||||||
|
The LP64 (32-bit integer) interface is the default build, and has
|
||||||
|
well-established C and Fortran APIs as determined by the reference (Netlib)
|
||||||
|
BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
|
||||||
|
not have a standard API: symbol names and shared/static library names can be
|
||||||
|
produced in multiple ways, and this tends to make it difficult to use.
|
||||||
|
As of today there is an agreed-upon way of choosing names for OpenBLAS between
|
||||||
|
a number of key users/redistributors, which is the closest thing to a standard
|
||||||
|
that there is now. However, there is an ongoing standardization effort in the
|
||||||
|
reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
|
||||||
|
agreed-upon convention. In this section we'll aim to explain both.
|
||||||
|
|
||||||
|
Those two methods are fairly similar, and have a key thing in common: *using a
|
||||||
|
symbol suffix*. This is good practice; if you distribute an ILP64 build, it is
|
||||||
|
recommended to have it use a symbol suffix containing `64` in the name.
|
||||||
|
This avoids potential symbol clashes when different packages which depend on
|
||||||
|
OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
|
||||||
|
|
||||||
|
### The current OpenBLAS agreed-upon ILP64 convention
|
||||||
|
|
||||||
|
This convention comprises the shared library name and the symbol suffix in the
|
||||||
|
shared library. The symbol suffix to use is `64_`, implying that the library
|
||||||
|
name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
|
||||||
|
The central issue where this was discussed is
|
||||||
|
[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
|
||||||
|
include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
|
||||||
|
|
||||||
|
To build shared and static libraries with the currently recommended ILP64
|
||||||
|
conventions with Make:
|
||||||
|
```bash
|
||||||
|
$ make INTERFACE64=1 SYMBOLSUFFIX=64_
|
||||||
|
```
|
||||||
|
|
||||||
|
This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
|
||||||
|
named `openblas64.pc`, and CMake and header files.
|
||||||
|
|
||||||
|
Installing locally and inspecting the output will show a few more details:
|
||||||
|
```bash
|
||||||
|
$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
|
||||||
|
$ tree . # output slightly edited down
|
||||||
|
.
|
||||||
|
├── include
|
||||||
|
│ ├── cblas.h
|
||||||
|
│ ├── f77blas.h
|
||||||
|
│ ├── lapacke_config.h
|
||||||
|
│ ├── lapacke.h
|
||||||
|
│ ├── lapacke_mangling.h
|
||||||
|
│ ├── lapacke_utils.h
|
||||||
|
│ ├── lapack.h
|
||||||
|
│ └── openblas_config.h
|
||||||
|
└── lib
|
||||||
|
├── cmake
|
||||||
|
│ └── openblas
|
||||||
|
│ ├── OpenBLASConfig.cmake
|
||||||
|
│ └── OpenBLASConfigVersion.cmake
|
||||||
|
├── libopenblas64_.a
|
||||||
|
├── libopenblas64_.so
|
||||||
|
└── pkgconfig
|
||||||
|
└── openblas64.pc
|
||||||
|
```
|
||||||
|
|
||||||
|
The symbol names are a key point. These will equal the LP64 symbol names, then
|
||||||
|
(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
|
||||||
|
Hence to obtain the final symbol names, we need to take into account which
|
||||||
|
Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
|
||||||
|
Fortran, or Flang), that means appending a single underscore. In that case, the
|
||||||
|
result is:
|
||||||
|
|
||||||
|
| base API name | binary symbol name | call from Fortran code | call from C code |
|
||||||
|
|---------------|--------------------|------------------------|-----------------------|
|
||||||
|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
|
||||||
|
| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` |
|
||||||
|
|
||||||
|
It is quite useful to have these symbol names be as uniform as possible across
|
||||||
|
different packaging systems.
|
||||||
|
|
||||||
|
The equivalent build options with CMake are:
|
||||||
|
```bash
|
||||||
|
$ mkdir build && cd build
|
||||||
|
$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
|
||||||
|
$ cmake --build . -j
|
||||||
|
```
|
||||||
|
|
||||||
|
Note that the result is not 100% identical to the Make result. For example, the
|
||||||
|
library name ends in `_64` rather than `64_` - it is recommended to rename them
|
||||||
|
to match the Make library names (also update the `libsuffix` entry in
|
||||||
|
`openblas64.pc` to match that rename).
|
||||||
|
```bash
|
||||||
|
$ cmake --install . --prefix $PWD/../../openblas/cmake64
|
||||||
|
$ tree .
|
||||||
|
.
|
||||||
|
├── include
|
||||||
|
│ └── openblas64
|
||||||
|
│ ├── cblas.h
|
||||||
|
│ ├── f77blas.h
|
||||||
|
│ ├── lapacke_config.h
|
||||||
|
│ ├── lapacke_example_aux.h
|
||||||
|
│ ├── lapacke.h
|
||||||
|
│ ├── lapacke_mangling.h
|
||||||
|
│ ├── lapacke_utils.h
|
||||||
|
│ ├── lapack.h
|
||||||
|
│ ├── openblas64
|
||||||
|
│ │ └── lapacke_mangling.h
|
||||||
|
│ └── openblas_config.h
|
||||||
|
└── lib
|
||||||
|
├── cmake
|
||||||
|
│ └── OpenBLAS64
|
||||||
|
│ ├── OpenBLAS64Config.cmake
|
||||||
|
│ ├── OpenBLAS64ConfigVersion.cmake
|
||||||
|
│ ├── OpenBLAS64Targets.cmake
|
||||||
|
│ └── OpenBLAS64Targets-noconfig.cmake
|
||||||
|
├── libopenblas_64.a
|
||||||
|
├── libopenblas_64.so -> libopenblas_64.so.0
|
||||||
|
└── pkgconfig
|
||||||
|
└── openblas64.pc
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### The upcoming standardized ILP64 convention
|
||||||
|
|
||||||
|
While the `64_` convention above got some adoption, it's slightly hacky and is
|
||||||
|
implemented through the use of `objcopy`. An effort is ongoing for a more
|
||||||
|
broadly adopted convention in the reference BLAS and LAPACK libraries, using
|
||||||
|
(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
|
||||||
|
Fortran compiler mangling. The central issue for this is
|
||||||
|
[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
|
||||||
|
|
||||||
|
For the most common cases of compiler mangling (a single `_` appended), the end
|
||||||
|
result will be:
|
||||||
|
|
||||||
|
| base API name | binary symbol name | call from Fortran code | call from C code |
|
||||||
|
|---------------|--------------------|------------------------|-----------------------|
|
||||||
|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
|
||||||
|
| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` |
|
||||||
|
|
||||||
|
For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
|
||||||
|
|
||||||
|
The shared library name for this `_64` convention should be `libopenblas_64.so`.
|
||||||
|
|
||||||
|
Note: it is not yet possible to produce an OpenBLAS build which employs this
|
||||||
|
convention! Once reference BLAS and LAPACK with support for `_64` have been
|
||||||
|
released, a future OpenBLAS release will support it. For now, please use the
|
||||||
|
older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
|
||||||
|
considered reserved for future use of the `_64` standard as prescribed by
|
||||||
|
reference BLAS/LAPACK.
|
||||||
|
|
||||||
|
|
||||||
|
## Performance and runtime behavior related build options
|
||||||
|
|
||||||
|
For these options there are multiple reasonable or common choices.
|
||||||
|
|
||||||
|
### Threading related options
|
||||||
|
|
||||||
|
OpenBLAS can be built as a multi-threaded or single-threaded library, with the
|
||||||
|
default being multi-threaded. It's expected that the default `libopenblas`
|
||||||
|
library is multi-threaded; if you'd like to also distribute single-threaded
|
||||||
|
builds, consider naming them `libopenblas_sequential`.
|
||||||
|
|
||||||
|
OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
|
||||||
|
default being pthreads. Both options are commonly used, and the choice here
|
||||||
|
should not influence the shared library name. The choice will be captured by
|
||||||
|
the `.pc` file. For example:
|
||||||
|
```bash
|
||||||
|
$ pkg-config --libs openblas
|
||||||
|
-fopenmp -lopenblas
|
||||||
|
|
||||||
|
$ cat openblas.pc
|
||||||
|
...
|
||||||
|
openblas_config= ... USE_OPENMP=1 MAX_THREADS=24
|
||||||
|
```
|
||||||
|
|
||||||
|
The maximum number of threads users will be able to use is determined at build
|
||||||
|
time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
|
||||||
|
range of values that are reasonable to use (up to 256). 64 is a typical choice
|
||||||
|
here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
|
||||||
|
Please see `Makefile.rule` for more details.
|
||||||
|
|
||||||
|
### CPU architecture related options
|
||||||
|
|
||||||
|
OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
|
||||||
|
distributing to a user base with a variety of hardware, it is recommended to
|
||||||
|
enable CPU architecture runtime detection. This will dynamically select
|
||||||
|
optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
|
||||||
|
build option. This is usually done on all common CPU families, except when
|
||||||
|
there are known issues.
|
||||||
|
|
||||||
|
In case the CPU architecture is known (e.g. you're building binaries for macOS
|
||||||
|
M1 users), it is possible to specify the target architecture directly with the
|
||||||
|
`TARGET=` build option.
|
||||||
|
|
||||||
|
`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
|
||||||
|
in this repository.
|
||||||
|
|
||||||
|
|
||||||
|
## Real-world examples
|
||||||
|
|
||||||
|
OpenBLAS is likely to be distributed in one of these distribution models:
|
||||||
|
|
||||||
|
1. As a standalone package, or multiple packages, in a packaging ecosystem like
|
||||||
|
a Linux distro, Homebrew, conda-forge or MSYS2.
|
||||||
|
2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
|
||||||
|
3. Locally, e.g. making it available as a build on a single HPC cluster.
|
||||||
|
|
||||||
|
The guidance on this page is most important for models (1) and (2). These links
|
||||||
|
to build recipes for a representative selection of packaging systems may be
|
||||||
|
helpful as a reference:
|
||||||
|
|
||||||
|
- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
|
||||||
|
- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
|
||||||
|
- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
|
||||||
|
- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
|
||||||
|
- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
|
||||||
|
- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
|
||||||
|
- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project. */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -44,10 +45,6 @@
|
||||||
#define DIVIDE_RATE 2
|
#define DIVIDE_RATE 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef SWITCH_RATIO
|
|
||||||
#define SWITCH_RATIO 2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//The array of job_t may overflow the stack.
|
//The array of job_t may overflow the stack.
|
||||||
//Instead, use malloc to alloc job_t.
|
//Instead, use malloc to alloc job_t.
|
||||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
BLASLONG divN, divT;
|
BLASLONG divN, divT;
|
||||||
int mode;
|
int mode;
|
||||||
|
|
||||||
|
#if defined(DYNAMIC_ARCH)
|
||||||
|
int switch_ratio = gotoblas->switch_ratio;
|
||||||
|
#else
|
||||||
|
int switch_ratio = SWITCH_RATIO;
|
||||||
|
#endif
|
||||||
|
|
||||||
if (range_m) {
|
if (range_m) {
|
||||||
BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
|
BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
|
||||||
BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
|
BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
|
||||||
|
@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
|
|
||||||
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
|
if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
|
||||||
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
|
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
divT = nthreads;
|
divT = nthreads;
|
||||||
divN = 1;
|
divN = 1;
|
||||||
|
|
||||||
while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
|
while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
|
||||||
do {
|
do {
|
||||||
divT --;
|
divT --;
|
||||||
divN = 1;
|
divN = 1;
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project. */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -44,10 +45,6 @@
|
||||||
#define DIVIDE_RATE 2
|
#define DIVIDE_RATE 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef SWITCH_RATIO
|
|
||||||
#define SWITCH_RATIO 2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
//The array of job_t may overflow the stack.
|
//The array of job_t may overflow the stack.
|
||||||
//Instead, use malloc to alloc job_t.
|
//Instead, use malloc to alloc job_t.
|
||||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
int mode, mask;
|
int mode, mask;
|
||||||
double dnum, di, dinum;
|
double dnum, di, dinum;
|
||||||
|
|
||||||
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
|
#if defined(DYNAMIC_ARCH)
|
||||||
|
int switch_ratio = gotoblas->switch_ratio;
|
||||||
|
#else
|
||||||
|
int switch_ratio = SWITCH_RATIO;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
|
||||||
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
|
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project. */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -44,10 +45,6 @@
|
||||||
#define DIVIDE_RATE 2
|
#define DIVIDE_RATE 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef SWITCH_RATIO
|
|
||||||
#define SWITCH_RATIO 2
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef GEMM_PREFERED_SIZE
|
#ifndef GEMM_PREFERED_SIZE
|
||||||
#define GEMM_PREFERED_SIZE 1
|
#define GEMM_PREFERED_SIZE 1
|
||||||
#endif
|
#endif
|
||||||
|
@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
BLASLONG width, i, j, k, js;
|
BLASLONG width, i, j, k, js;
|
||||||
BLASLONG m, n, n_from, n_to;
|
BLASLONG m, n, n_from, n_to;
|
||||||
int mode;
|
int mode;
|
||||||
|
#if defined(DYNAMIC_ARCH)
|
||||||
|
int switch_ratio = gotoblas->switch_ratio;
|
||||||
|
#else
|
||||||
|
int switch_ratio = SWITCH_RATIO;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Get execution mode */
|
/* Get execution mode */
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
|
@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
num_parts = 0;
|
num_parts = 0;
|
||||||
while (n > 0){
|
while (n > 0){
|
||||||
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
|
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
|
||||||
if (width < SWITCH_RATIO) {
|
if (width < switch_ratio) {
|
||||||
width = SWITCH_RATIO;
|
width = switch_ratio;
|
||||||
}
|
}
|
||||||
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||||
|
|
||||||
|
@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
|
||||||
BLASLONG m = args -> m;
|
BLASLONG m = args -> m;
|
||||||
BLASLONG n = args -> n;
|
BLASLONG n = args -> n;
|
||||||
BLASLONG nthreads_m, nthreads_n;
|
BLASLONG nthreads_m, nthreads_n;
|
||||||
|
#if defined(DYNAMIC_ARCH)
|
||||||
|
int switch_ratio = gotoblas->switch_ratio;
|
||||||
|
#else
|
||||||
|
int switch_ratio = SWITCH_RATIO;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* Get dimensions from index ranges if available */
|
/* Get dimensions from index ranges if available */
|
||||||
if (range_m) {
|
if (range_m) {
|
||||||
|
@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
|
||||||
n = range_n[1] - range_n[0];
|
n = range_n[1] - range_n[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Partitions in m should have at least SWITCH_RATIO rows */
|
/* Partitions in m should have at least switch_ratio rows */
|
||||||
if (m < 2 * SWITCH_RATIO) {
|
if (m < 2 * switch_ratio) {
|
||||||
nthreads_m = 1;
|
nthreads_m = 1;
|
||||||
} else {
|
} else {
|
||||||
nthreads_m = args -> nthreads;
|
nthreads_m = args -> nthreads;
|
||||||
while (m < nthreads_m * SWITCH_RATIO) {
|
while (m < nthreads_m * switch_ratio) {
|
||||||
nthreads_m = nthreads_m / 2;
|
nthreads_m = nthreads_m / 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
|
/* Partitions in n should have at most switch_ratio * nthreads_m columns */
|
||||||
if (n < SWITCH_RATIO * nthreads_m) {
|
if (n < switch_ratio * nthreads_m) {
|
||||||
nthreads_n = 1;
|
nthreads_n = 1;
|
||||||
} else {
|
} else {
|
||||||
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
|
nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
|
||||||
if (nthreads_m * nthreads_n > args -> nthreads) {
|
if (nthreads_m * nthreads_n > args -> nthreads) {
|
||||||
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
|
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
|
||||||
}
|
}
|
||||||
|
|
|
@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
increased_threads = 1;
|
increased_threads = 1;
|
||||||
|
|
||||||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
|
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||||
|
|
||||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
|
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
|
||||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||||
|
|
|
@ -68,6 +68,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
int blas_omp_number_max = 0;
|
||||||
|
|
||||||
extern int openblas_omp_adaptive_env();
|
extern int openblas_omp_adaptive_env();
|
||||||
|
|
||||||
|
@ -100,8 +101,6 @@ static void adjust_thread_buffers() {
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads) {
|
void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
blas_num_threads_set = 1;
|
|
||||||
if (num_threads < 0) blas_num_threads_set = 0;
|
|
||||||
if (num_threads < 1) num_threads = blas_num_threads;
|
if (num_threads < 1) num_threads = blas_num_threads;
|
||||||
|
|
||||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||||
|
@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int blas_thread_init(void){
|
int blas_thread_init(void){
|
||||||
|
if(blas_omp_number_max <= 0)
|
||||||
|
blas_omp_number_max = omp_get_max_threads();
|
||||||
|
|
||||||
blas_get_cpu_number();
|
blas_get_cpu_number();
|
||||||
|
|
||||||
|
|
|
@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads)
|
||||||
blas_server_avail = 1;
|
blas_server_avail = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
|
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||||
|
|
||||||
blas_threads[i] = CreateThread(NULL, 0,
|
blas_threads[i] = CreateThread(NULL, 0,
|
||||||
blas_thread_server, (void *)i,
|
blas_thread_server, (void *)i,
|
||||||
|
|
|
@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
||||||
#else
|
#else
|
||||||
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
|
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef DYN_SAPPHIRERAPIDS
|
||||||
|
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
|
||||||
|
#elif defined(DYN_SKYLAKEX)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX
|
||||||
|
#elif defined(DYN_HASWELL)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#else // not DYNAMIC_LIST
|
#else // not DYNAMIC_LIST
|
||||||
|
@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN;
|
||||||
#ifndef NO_AVX512
|
#ifndef NO_AVX512
|
||||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||||
extern gotoblas_t gotoblas_COOPERLAKE;
|
extern gotoblas_t gotoblas_COOPERLAKE;
|
||||||
|
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
|
||||||
#else
|
#else
|
||||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||||
#define gotoblas_COOPERLAKE gotoblas_HASWELL
|
#define gotoblas_COOPERLAKE gotoblas_HASWELL
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
|
@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
||||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||||
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
|
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
|
||||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||||
|
@ -378,6 +394,31 @@ int support_avx512_bf16(){
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define BIT_AMX_TILE 0x01000000
|
||||||
|
#define BIT_AMX_BF16 0x00400000
|
||||||
|
#define BIT_AMX_ENBD 0x00060000
|
||||||
|
|
||||||
|
/*
 * Runtime check used to decide whether AMX BF16 can be used.
 *
 * Returns 1 only when all of the following hold:
 *   - support_avx512() reports success,
 *   - CPUID leaf 7, subleaf 0 has both BIT_AMX_TILE and BIT_AMX_BF16 set in EDX,
 *   - CPUID leaf 0xd, subleaf 0 has every bit of BIT_AMX_ENBD set in EAX.
 * Returns 0 in every other case, and unconditionally when the build defines
 * NO_AVX or NO_AVX512.
 */
int support_amx_bf16() {
|
||||||
|
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||||
|
int eax, ebx, ecx, edx;
|
||||||
|
int ret=0;
|
||||||
|
|
||||||
|
if (!support_avx512())
|
||||||
|
return 0;
|
||||||
|
// CPUID.7.0:EDX indicates AMX support (both the TILE and BF16 bits are required)
|
||||||
|
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
|
||||||
|
// CPUID.D.0:EAX[18:17] indicates AMX enabled; both bits of BIT_AMX_ENBD must be set
|
||||||
|
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
|
||||||
|
ret = 1;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
extern void openblas_warning(int verbose, const char * msg);
|
||||||
#define FALLBACK_VERBOSE 1
|
#define FALLBACK_VERBOSE 1
|
||||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||||
|
@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (model == 15){ // Sapphire Rapids
|
if (model == 15){ // Sapphire Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return &gotoblas_SAPPHIRERAPIDS;
|
||||||
if(support_avx512_bf16())
|
if(support_avx512_bf16())
|
||||||
return &gotoblas_COOPERLAKE;
|
return &gotoblas_COOPERLAKE;
|
||||||
if (support_avx512())
|
if (support_avx512())
|
||||||
|
@ -941,7 +984,8 @@ static char *corename[] = {
|
||||||
"Excavator",
|
"Excavator",
|
||||||
"Zen",
|
"Zen",
|
||||||
"SkylakeX",
|
"SkylakeX",
|
||||||
"Cooperlake"
|
"Cooperlake",
|
||||||
|
"SapphireRapids"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
|
@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||||
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
|
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
|
||||||
|
if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26];
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
|
||||||
#else
|
#else
|
||||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef DYN_ARMV8SVE
|
||||||
|
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||||
|
#else
|
||||||
|
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||||
|
#endif
|
||||||
#ifdef DYN_CORTEX_A55
|
#ifdef DYN_CORTEX_A55
|
||||||
extern gotoblas_t gotoblas_CORTEXA55;
|
extern gotoblas_t gotoblas_CORTEXA55;
|
||||||
#else
|
#else
|
||||||
|
@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||||
#ifndef NO_SVE
|
#ifndef NO_SVE
|
||||||
extern gotoblas_t gotoblas_NEOVERSEV1;
|
extern gotoblas_t gotoblas_NEOVERSEV1;
|
||||||
extern gotoblas_t gotoblas_NEOVERSEN2;
|
extern gotoblas_t gotoblas_NEOVERSEN2;
|
||||||
|
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||||
#else
|
#else
|
||||||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
|
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
|
||||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||||
|
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||||
#endif
|
#endif
|
||||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||||
extern gotoblas_t gotoblas_CORTEXA55;
|
extern gotoblas_t gotoblas_CORTEXA55;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
extern void openblas_warning(int verbose, const char * msg);
|
||||||
|
#define FALLBACK_VERBOSE 1
|
||||||
|
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
||||||
|
|
||||||
#define NUM_CORETYPES 13
|
#define NUM_CORETYPES 16
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||||
|
@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||||
#ifndef HWCAP_CPUID
|
#ifndef HWCAP_CPUID
|
||||||
#define HWCAP_CPUID (1 << 11)
|
#define HWCAP_CPUID (1 << 11)
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef HWCAP_SVE
|
||||||
|
#define HWCAP_SVE (1 << 22)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define get_cpu_ftr(id, var) ({ \
|
#define get_cpu_ftr(id, var) ({ \
|
||||||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
||||||
|
@ -168,6 +181,7 @@ static char *corename[] = {
|
||||||
"neoversen2",
|
"neoversen2",
|
||||||
"thunderx3t110",
|
"thunderx3t110",
|
||||||
"cortexa55",
|
"cortexa55",
|
||||||
|
"armv8sve",
|
||||||
"unknown"
|
"unknown"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -187,6 +201,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
||||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
||||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
||||||
|
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
|
||||||
return corename[NUM_CORETYPES];
|
return corename[NUM_CORETYPES];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||||
case 12: return (&gotoblas_NEOVERSEN2);
|
case 12: return (&gotoblas_NEOVERSEN2);
|
||||||
case 13: return (&gotoblas_THUNDERX3T110);
|
case 13: return (&gotoblas_THUNDERX3T110);
|
||||||
case 14: return (&gotoblas_CORTEXA55);
|
case 14: return (&gotoblas_CORTEXA55);
|
||||||
|
case 15: return (&gotoblas_ARMV8SVE);
|
||||||
}
|
}
|
||||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||||
openblas_warning(1, message);
|
openblas_warning(1, message);
|
||||||
|
@ -281,8 +297,16 @@ static gotoblas_t *get_coretype(void) {
|
||||||
return &gotoblas_NEOVERSEN1;
|
return &gotoblas_NEOVERSEN1;
|
||||||
#ifndef NO_SVE
|
#ifndef NO_SVE
|
||||||
case 0xd49:
|
case 0xd49:
|
||||||
|
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||||
|
return &gotoblas_NEOVERSEN1;
|
||||||
|
} else
|
||||||
return &gotoblas_NEOVERSEN2;
|
return &gotoblas_NEOVERSEN2;
|
||||||
case 0xd40:
|
case 0xd40:
|
||||||
|
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||||
|
return &gotoblas_NEOVERSEN1;
|
||||||
|
}else
|
||||||
return &gotoblas_NEOVERSEV1;
|
return &gotoblas_NEOVERSEV1;
|
||||||
#endif
|
#endif
|
||||||
case 0xd05: // Cortex A55
|
case 0xd05: // Cortex A55
|
||||||
|
@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
|
||||||
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
|
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
|
||||||
openblas_warning(1, coremsg);
|
openblas_warning(1, coremsg);
|
||||||
}
|
}
|
||||||
|
#ifndef NO_SVE
|
||||||
|
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||||
|
return &gotoblas_ARMV8SVE;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
|
@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
|
||||||
*/
|
*/
|
||||||
int blas_num_threads = 0;
|
int blas_num_threads = 0;
|
||||||
|
|
||||||
int blas_num_threads_set = 0;
|
|
||||||
|
|
||||||
int goto_get_num_procs (void) {
|
int goto_get_num_procs (void) {
|
||||||
return blas_cpu_number;
|
return blas_cpu_number;
|
||||||
}
|
}
|
||||||
|
@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
|
||||||
*/
|
*/
|
||||||
int blas_num_threads = 0;
|
int blas_num_threads = 0;
|
||||||
|
|
||||||
int blas_num_threads_set = 0;
|
|
||||||
|
|
||||||
int goto_get_num_procs (void) {
|
int goto_get_num_procs (void) {
|
||||||
return blas_cpu_number;
|
return blas_cpu_number;
|
||||||
}
|
}
|
||||||
|
@ -3015,6 +3011,8 @@ void *blas_memory_alloc(int procpos){
|
||||||
#endif
|
#endif
|
||||||
if (memory_overflowed) goto terminate;
|
if (memory_overflowed) goto terminate;
|
||||||
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
|
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
|
||||||
|
fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
|
||||||
|
fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS);
|
||||||
memory_overflowed=1;
|
memory_overflowed=1;
|
||||||
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
|
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
|
||||||
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
|
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
|
||||||
|
|
|
@ -283,7 +283,6 @@ The numbers of threads in the thread pool.
|
||||||
This value is equal or large than blas_cpu_number. This means some threads are sleep.
|
This value is equal or large than blas_cpu_number. This means some threads are sleep.
|
||||||
*/
|
*/
|
||||||
int blas_num_threads = 0;
|
int blas_num_threads = 0;
|
||||||
int blas_num_threads_set = 0;
|
|
||||||
|
|
||||||
int goto_get_num_procs (void) {
|
int goto_get_num_procs (void) {
|
||||||
return blas_cpu_number;
|
return blas_cpu_number;
|
||||||
|
|
|
@ -21,7 +21,7 @@ blasobjsc="
|
||||||
chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
|
chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
|
||||||
chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
|
chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
|
||||||
csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
|
csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
|
||||||
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum"
|
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt"
|
||||||
|
|
||||||
blasobjsd="
|
blasobjsd="
|
||||||
damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
|
damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
|
||||||
|
@ -29,7 +29,7 @@ blasobjsd="
|
||||||
dscal dsdot dspmv dspr2 dimatcopy domatcopy
|
dscal dsdot dspmv dspr2 dimatcopy domatcopy
|
||||||
dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
|
dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
|
||||||
dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
|
dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
|
||||||
idamax idamin idmax idmin dgeadd dsum"
|
idamax idamin idmax idmin dgeadd dsum dgemmt"
|
||||||
|
|
||||||
blasobjss="
|
blasobjss="
|
||||||
isamax isamin ismax ismin
|
isamax isamin ismax ismin
|
||||||
|
@ -38,7 +38,7 @@ blasobjss="
|
||||||
smax smin snrm2 simatcopy somatcopy
|
smax smin snrm2 simatcopy somatcopy
|
||||||
srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
|
srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
|
||||||
ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
|
ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
|
||||||
strmm strmv strsm strsv sgeadd ssum"
|
strmm strmv strsm strsv sgeadd ssum sgemmt"
|
||||||
|
|
||||||
blasobjsz="
|
blasobjsz="
|
||||||
izamax izamin
|
izamax izamin
|
||||||
|
@ -48,7 +48,7 @@ blasobjsz="
|
||||||
zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
|
zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
|
||||||
ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
|
ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
|
||||||
zomatcopy zimatcopy dzamax dzamin dzasum dznrm2
|
zomatcopy zimatcopy dzamax dzamin dzasum dznrm2
|
||||||
zgeadd dzsum"
|
zgeadd dzsum zgemmt"
|
||||||
|
|
||||||
blasobjs="lsame xerbla"
|
blasobjs="lsame xerbla"
|
||||||
bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
|
bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
|
||||||
|
@ -58,7 +58,7 @@ cblasobjsc="
|
||||||
cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
|
cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
|
||||||
cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
|
cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
|
||||||
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
||||||
cblas_scnrm2 cblas_scasum
|
cblas_scnrm2 cblas_scasum cblas_cgemmt
|
||||||
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
||||||
"
|
"
|
||||||
cblasobjsd="
|
cblasobjsd="
|
||||||
|
@ -67,7 +67,7 @@ cblasobjsd="
|
||||||
cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
|
cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
|
||||||
cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
|
cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
|
||||||
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
||||||
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd
|
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
|
||||||
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
||||||
"
|
"
|
||||||
|
|
||||||
|
@ -78,7 +78,7 @@ cblasobjss="
|
||||||
cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
|
cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
|
||||||
cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
|
cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
|
||||||
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
||||||
cblas_strsv cblas_sgeadd
|
cblas_strsv cblas_sgeadd cblas_sgemmt
|
||||||
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
||||||
"
|
"
|
||||||
|
|
||||||
|
@ -89,7 +89,7 @@ cblasobjsz="
|
||||||
cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
|
cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
|
||||||
cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
|
cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
|
||||||
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
||||||
cblas_zaxpby cblas_zgeadd
|
cblas_zaxpby cblas_zgeadd cblas_zgemmt
|
||||||
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
||||||
"
|
"
|
||||||
|
|
||||||
|
@ -716,6 +716,7 @@ lapackobjs2z="$lapackobjs2z
|
||||||
# functions added for lapack-3.7.0
|
# functions added for lapack-3.7.0
|
||||||
lapackobjs2s="$lapackobjs2s
|
lapackobjs2s="$lapackobjs2s
|
||||||
slarfy
|
slarfy
|
||||||
|
ssyconvf
|
||||||
strevc3
|
strevc3
|
||||||
sgelqt
|
sgelqt
|
||||||
sgelqt3
|
sgelqt3
|
||||||
|
@ -843,6 +844,23 @@ lapackobjs2z="$lapackobjs2z
|
||||||
zungtsqr_row
|
zungtsqr_row
|
||||||
"
|
"
|
||||||
|
|
||||||
|
#functions added for lapack-3.11
|
||||||
|
lapackobjs2c="$lapackobjs2c
|
||||||
|
cgedmd
|
||||||
|
cgedmdq
|
||||||
|
"
|
||||||
|
lapackobjs2d="$lapackobjs2d
|
||||||
|
dgedmd
|
||||||
|
dgedmdq
|
||||||
|
"
|
||||||
|
lapackobjs2s="$lapackobjs2s
|
||||||
|
sgedmd
|
||||||
|
sgedmdq
|
||||||
|
"
|
||||||
|
lapackobjs2z="$lapackobjs2z
|
||||||
|
zgedmd
|
||||||
|
zgedmdq
|
||||||
|
"
|
||||||
lapack_extendedprecision_objs="
|
lapack_extendedprecision_objs="
|
||||||
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
|
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
|
||||||
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
|
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
|
||||||
|
@ -1012,6 +1030,10 @@ lapackeobjsc="
|
||||||
LAPACKE_cgebrd_work
|
LAPACKE_cgebrd_work
|
||||||
LAPACKE_cgecon
|
LAPACKE_cgecon
|
||||||
LAPACKE_cgecon_work
|
LAPACKE_cgecon_work
|
||||||
|
LAPACKE_cgedmd
|
||||||
|
LAPACKE_cgedmd_work
|
||||||
|
LAPACKE_cgedmdq
|
||||||
|
LAPACKE_cgedmdq_work
|
||||||
LAPACKE_cgeequ
|
LAPACKE_cgeequ
|
||||||
LAPACKE_cgeequ_work
|
LAPACKE_cgeequ_work
|
||||||
LAPACKE_cgeequb
|
LAPACKE_cgeequb
|
||||||
|
@ -1671,6 +1693,10 @@ lapackeobjsd="
|
||||||
LAPACKE_dgebrd_work
|
LAPACKE_dgebrd_work
|
||||||
LAPACKE_dgecon
|
LAPACKE_dgecon
|
||||||
LAPACKE_dgecon_work
|
LAPACKE_dgecon_work
|
||||||
|
LAPACKE_dgedmd
|
||||||
|
LAPACKE_dgedmd_work
|
||||||
|
LAPACKE_dgedmdq
|
||||||
|
LAPACKE_dgedmdq_work
|
||||||
LAPACKE_dgeequ
|
LAPACKE_dgeequ
|
||||||
LAPACKE_dgeequ_work
|
LAPACKE_dgeequ_work
|
||||||
LAPACKE_dgeequb
|
LAPACKE_dgeequb
|
||||||
|
@ -2284,6 +2310,10 @@ lapackeobjss="
|
||||||
LAPACKE_sgebrd_work
|
LAPACKE_sgebrd_work
|
||||||
LAPACKE_sgecon
|
LAPACKE_sgecon
|
||||||
LAPACKE_sgecon_work
|
LAPACKE_sgecon_work
|
||||||
|
LAPACKE_sgedmd
|
||||||
|
LAPACKE_sgedmd_work
|
||||||
|
LAPACKE_sgedmdq
|
||||||
|
LAPACKE_sgedmdq_work
|
||||||
LAPACKE_sgeequ
|
LAPACKE_sgeequ
|
||||||
LAPACKE_sgeequ_work
|
LAPACKE_sgeequ_work
|
||||||
LAPACKE_sgeequb
|
LAPACKE_sgeequb
|
||||||
|
@ -2893,6 +2923,10 @@ lapackeobjsz="
|
||||||
LAPACKE_zgebrd_work
|
LAPACKE_zgebrd_work
|
||||||
LAPACKE_zgecon
|
LAPACKE_zgecon
|
||||||
LAPACKE_zgecon_work
|
LAPACKE_zgecon_work
|
||||||
|
LAPACKE_zgedmd
|
||||||
|
LAPACKE_zgedmd_work
|
||||||
|
LAPACKE_zgedmdq
|
||||||
|
LAPACKE_zgedmdq_work
|
||||||
LAPACKE_zgeequ
|
LAPACKE_zgeequ
|
||||||
LAPACKE_zgeequ_work
|
LAPACKE_zgeequ_work
|
||||||
LAPACKE_zgeequb
|
LAPACKE_zgeequb
|
||||||
|
|
|
@ -21,7 +21,7 @@
|
||||||
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
|
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
|
||||||
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
|
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
|
||||||
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
|
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
|
||||||
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum);
|
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
|
||||||
|
|
||||||
@blasobjsd = (
|
@blasobjsd = (
|
||||||
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
|
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
|
||||||
|
@ -29,7 +29,7 @@
|
||||||
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
|
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
|
||||||
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
|
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
|
||||||
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
|
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
|
||||||
idamax,idamin,idmax,idmin,dgeadd,dsum);
|
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
|
||||||
|
|
||||||
@blasobjss = (
|
@blasobjss = (
|
||||||
isamax,isamin,ismax,ismin,
|
isamax,isamin,ismax,ismin,
|
||||||
|
@ -38,7 +38,7 @@
|
||||||
smax,smin,snrm2,simatcopy,somatcopy,
|
smax,smin,snrm2,simatcopy,somatcopy,
|
||||||
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
|
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
|
||||||
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
|
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
|
||||||
strmm,strmv,strsm,strsv, sgeadd,ssum);
|
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
|
||||||
|
|
||||||
@blasobjsz = (
|
@blasobjsz = (
|
||||||
izamax,izamin,,
|
izamax,izamin,,
|
||||||
|
@ -48,7 +48,7 @@
|
||||||
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
||||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
||||||
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
|
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
|
||||||
zgeadd, dzsum);
|
zgeadd, dzsum, zgemmt);
|
||||||
|
|
||||||
@blasobjs = (lsame, xerbla);
|
@blasobjs = (lsame, xerbla);
|
||||||
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
|
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
|
||||||
|
@ -60,7 +60,7 @@
|
||||||
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
|
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
|
||||||
cblas_scnrm2, cblas_scasum,
|
cblas_scnrm2, cblas_scasum,
|
||||||
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
|
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
|
||||||
);
|
cblas_cgemmt);
|
||||||
@cblasobjsd = (
|
@cblasobjsd = (
|
||||||
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
|
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
|
||||||
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
|
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
|
||||||
|
@ -69,7 +69,7 @@
|
||||||
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
|
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
|
||||||
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
|
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
|
||||||
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
|
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
|
||||||
);
|
cblas_dgemmt);
|
||||||
|
|
||||||
@cblasobjss = (
|
@cblasobjss = (
|
||||||
cblas_sasum, cblas_saxpy, cblas_saxpby,
|
cblas_sasum, cblas_saxpy, cblas_saxpby,
|
||||||
|
@ -80,7 +80,7 @@
|
||||||
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
|
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
|
||||||
cblas_strsv, cblas_sgeadd,
|
cblas_strsv, cblas_sgeadd,
|
||||||
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
|
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
|
||||||
);
|
cblas_sgemmt);
|
||||||
@cblasobjsz = (
|
@cblasobjsz = (
|
||||||
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
|
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
|
||||||
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
|
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
|
||||||
|
@ -90,7 +90,7 @@
|
||||||
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
|
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
|
||||||
cblas_zaxpby, cblas_zgeadd,
|
cblas_zaxpby, cblas_zgeadd,
|
||||||
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
|
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
|
||||||
);
|
cblas_zgemmt);
|
||||||
|
|
||||||
@cblasobjs = ( cblas_xerbla );
|
@cblasobjs = ( cblas_xerbla );
|
||||||
|
|
||||||
|
|
7
f_check
7
f_check
|
@ -101,6 +101,13 @@ else
|
||||||
*flang*)
|
*flang*)
|
||||||
vendor=FLANG
|
vendor=FLANG
|
||||||
openmp='-fopenmp'
|
openmp='-fopenmp'
|
||||||
|
data=`$compiler -v 2>&1 > /dev/null `
|
||||||
|
v="${data#*version *}"
|
||||||
|
v="${v%%*.}"
|
||||||
|
major="${v%%.*}"
|
||||||
|
if [ "$major" -ge 17 ]; then
|
||||||
|
vendor=FLANGNEW
|
||||||
|
fi
|
||||||
;;
|
;;
|
||||||
*ifort*|*ifx*)
|
*ifort*|*ifx*)
|
||||||
vendor=INTEL
|
vendor=INTEL
|
||||||
|
|
|
@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n");
|
||||||
|
|
||||||
#ifdef MAKE_NB_JOBS
|
#ifdef MAKE_NB_JOBS
|
||||||
#if MAKE_NB_JOBS > 0
|
#if MAKE_NB_JOBS > 0
|
||||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS);
|
||||||
#else
|
#else
|
||||||
// Let make use parent -j argument or -j1 if there
|
// Let make use parent -j argument or -j1 if there
|
||||||
// is no make parent
|
// is no make parent
|
||||||
#endif
|
#endif
|
||||||
#elif NO_PARALLEL_MAKE==1
|
#elif NO_PARALLEL_MAKE==1
|
||||||
printf("MAKE += -j 1\n");
|
printf("MAKEFLAGS += -j 1\n");
|
||||||
#else
|
#else
|
||||||
printf("MAKE += -j %d\n", get_num_cores());
|
printf("MAKEFLAGS += -j %d\n", get_num_cores());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
||||||
info = 0;
|
info = 0;
|
||||||
|
|
||||||
|
|
||||||
if (lda < MAX(1, m)) info = 6;
|
if (lda < MAX(1, m)) info = 5;
|
||||||
if (ldc < MAX(1, m)) info = 8;
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
|
||||||
if (n < 0) info = 2;
|
if (n < 0) info = 2;
|
||||||
|
|
|
@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||||
|
#define XFEATURE_XTILEDATA 18
|
||||||
|
#define ARCH_REQ_XCOMP_PERM 0x1023
|
||||||
|
static int openblas_amxtile_permission = 0;
|
||||||
|
static int init_amxtile_permission() {
|
||||||
|
long status =
|
||||||
|
syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
|
||||||
|
if (status != 0) {
|
||||||
|
fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, "
|
||||||
|
"Intel Sapphier Rapids), skip sbgemm calculation\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
openblas_amxtile_permission = 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
void NAME(char *TRANSA, char *TRANSB,
|
void NAME(char *TRANSA, char *TRANSB,
|
||||||
|
@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||||
|
#if defined(DYNAMIC_ARCH)
|
||||||
|
if (gotoblas->need_amxtile_permission &&
|
||||||
|
openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS)
|
||||||
|
if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||||
|
|
||||||
if ((args.m == 0) || (args.n == 0)) return;
|
if ((args.m == 0) || (args.n == 0)) return;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
|
@ -35,29 +35,26 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#ifdef FUNCTION_PROFILE
|
|
||||||
#include "functable.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#define SMP_THRESHOLD_MIN 65536.0
|
#define SMP_THRESHOLD_MIN 65536.0
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "QGEMT "
|
#define ERROR_NAME "QGEMMT "
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
#define ERROR_NAME "DGEMT "
|
#define ERROR_NAME "DGEMMT "
|
||||||
#elif defined(BFLOAT16)
|
#elif defined(BFLOAT16)
|
||||||
#define ERROR_NAME "SBGEMT "
|
#define ERROR_NAME "SBGEMMT "
|
||||||
#else
|
#else
|
||||||
#define ERROR_NAME "SGEMT "
|
#define ERROR_NAME "SGEMMT "
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#define SMP_THRESHOLD_MIN 8192.0
|
#define SMP_THRESHOLD_MIN 8192.0
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
#define ERROR_NAME "XGEMT "
|
#define ERROR_NAME "XGEMMT "
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
#define ERROR_NAME "ZGEMT "
|
#define ERROR_NAME "ZGEMMT "
|
||||||
#else
|
#else
|
||||||
#define ERROR_NAME "CGEMT "
|
#define ERROR_NAME "CGEMMT "
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -68,18 +65,19 @@
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
blasint * M, blasint * N, blasint * K,
|
blasint * M, blasint * K,
|
||||||
FLOAT * Alpha,
|
FLOAT * Alpha,
|
||||||
IFLOAT * a, blasint * ldA,
|
IFLOAT * a, blasint * ldA,
|
||||||
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
||||||
{
|
{
|
||||||
|
|
||||||
blasint m, n, k;
|
blasint m, k;
|
||||||
blasint lda, ldb, ldc;
|
blasint lda, ldb, ldc;
|
||||||
int transa, transb, uplo;
|
int transa, transb, uplo;
|
||||||
blasint info;
|
blasint info;
|
||||||
|
|
||||||
char transA, transB, Uplo;
|
char transA, transB, Uplo;
|
||||||
|
blasint nrowa, nrowb;
|
||||||
IFLOAT *buffer;
|
IFLOAT *buffer;
|
||||||
IFLOAT *aa, *bb;
|
IFLOAT *aa, *bb;
|
||||||
FLOAT *cc;
|
FLOAT *cc;
|
||||||
|
@ -92,7 +90,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
PRINT_DEBUG_NAME;
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
m = *M;
|
m = *M;
|
||||||
n = *N;
|
|
||||||
k = *K;
|
k = *K;
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
|
@ -159,32 +156,39 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
if (Uplo == 'L')
|
if (Uplo == 'L')
|
||||||
uplo = 1;
|
uplo = 1;
|
||||||
|
|
||||||
|
nrowa = m;
|
||||||
|
if (transa) nrowa = k;
|
||||||
|
nrowb = k;
|
||||||
|
if (transb) nrowb = m;
|
||||||
|
|
||||||
info = 0;
|
info = 0;
|
||||||
|
|
||||||
if (uplo < 0)
|
if (ldc < MAX(1, m))
|
||||||
info = 14;
|
|
||||||
if (ldc < m)
|
|
||||||
info = 13;
|
info = 13;
|
||||||
|
if (ldb < MAX(1, nrowa))
|
||||||
|
info = 10;
|
||||||
|
if (lda < MAX(1, nrowb))
|
||||||
|
info = 8;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
if (n < 0)
|
|
||||||
info = 4;
|
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
info = 3;
|
info = 4;
|
||||||
if (transb < 0)
|
if (transb < 0)
|
||||||
info = 2;
|
info = 3;
|
||||||
if (transa < 0)
|
if (transa < 0)
|
||||||
|
info = 2;
|
||||||
|
if (uplo < 0)
|
||||||
info = 1;
|
info = 1;
|
||||||
|
|
||||||
if (info) {
|
if (info != 0) {
|
||||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
||||||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
|
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
|
||||||
blasint N, blasint k,
|
blasint k,
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
FLOAT alpha,
|
FLOAT alpha,
|
||||||
IFLOAT * A, blasint LDA,
|
IFLOAT * A, blasint LDA,
|
||||||
|
@ -205,17 +209,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
int transa, transb, uplo;
|
int transa, transb, uplo;
|
||||||
blasint info;
|
blasint info;
|
||||||
blasint m, n, lda, ldb;
|
blasint lda, ldb;
|
||||||
FLOAT *a, *b;
|
FLOAT *a, *b;
|
||||||
XFLOAT *buffer;
|
XFLOAT *buffer;
|
||||||
|
|
||||||
PRINT_DEBUG_CNAME;
|
PRINT_DEBUG_CNAME;
|
||||||
|
|
||||||
|
uplo = -1;
|
||||||
transa = -1;
|
transa = -1;
|
||||||
transb = -1;
|
transb = -1;
|
||||||
info = 0;
|
info = 0;
|
||||||
|
|
||||||
if (order == CblasColMajor) {
|
if (order == CblasColMajor) {
|
||||||
|
if (Uplo == CblasUpper) uplo = 0;
|
||||||
|
if (Uplo == CblasLower) uplo = 1;
|
||||||
|
|
||||||
if (TransA == CblasNoTrans)
|
if (TransA == CblasNoTrans)
|
||||||
transa = 0;
|
transa = 0;
|
||||||
|
@ -248,9 +255,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
transb = 3;
|
transb = 3;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
m = M;
|
|
||||||
n = N;
|
|
||||||
|
|
||||||
a = (void *)A;
|
a = (void *)A;
|
||||||
b = (void *)B;
|
b = (void *)B;
|
||||||
lda = LDA;
|
lda = LDA;
|
||||||
|
@ -258,23 +262,31 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
info = -1;
|
info = -1;
|
||||||
|
|
||||||
if (ldc < m)
|
blasint nrowa, nrowb;
|
||||||
|
nrowa = m;
|
||||||
|
if (transa) nrowa = k;
|
||||||
|
nrowb = k;
|
||||||
|
if (transb) nrowb = m;
|
||||||
|
|
||||||
|
if (ldc < MAX(1, m))
|
||||||
info = 13;
|
info = 13;
|
||||||
|
if (ldb < MAX(1, nrowb))
|
||||||
|
info = 10;
|
||||||
|
if (lda < MAX(1, nrowa))
|
||||||
|
info = 8;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
if (n < 0)
|
|
||||||
info = 4;
|
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
info = 3;
|
info = 4;
|
||||||
if (transb < 0)
|
if (transb < 0)
|
||||||
info = 2;
|
info = 3;
|
||||||
if (transa < 0)
|
if (transa < 0)
|
||||||
|
info = 2;
|
||||||
|
if (uplo < 0)
|
||||||
info = 1;
|
info = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (order == CblasRowMajor) {
|
if (order == CblasRowMajor) {
|
||||||
m = N;
|
|
||||||
n = M;
|
|
||||||
|
|
||||||
a = (void *)B;
|
a = (void *)B;
|
||||||
b = (void *)A;
|
b = (void *)A;
|
||||||
|
@ -282,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
lda = LDB;
|
lda = LDB;
|
||||||
ldb = LDA;
|
ldb = LDA;
|
||||||
|
|
||||||
|
if (Uplo == CblasUpper) uplo = 0;
|
||||||
|
if (Uplo == CblasLower) uplo = 1;
|
||||||
|
|
||||||
if (TransB == CblasNoTrans)
|
if (TransB == CblasNoTrans)
|
||||||
transa = 0;
|
transa = 0;
|
||||||
if (TransB == CblasTrans)
|
if (TransB == CblasTrans)
|
||||||
|
@ -315,28 +330,29 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
info = -1;
|
info = -1;
|
||||||
|
|
||||||
if (ldc < m)
|
blasint ncola, ncolb;
|
||||||
|
ncola = k;
|
||||||
|
if (transa) ncola = m;
|
||||||
|
ncolb = m;
|
||||||
|
if (transb) ncolb = k;
|
||||||
|
|
||||||
|
if (ldc < MAX(1,m))
|
||||||
info = 13;
|
info = 13;
|
||||||
|
if (ldb < MAX(1, ncolb))
|
||||||
|
info = 10;
|
||||||
|
if (lda < MAX(1, ncola))
|
||||||
|
info = 8;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
if (n < 0)
|
|
||||||
info = 4;
|
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
info = 3;
|
info = 4;
|
||||||
if (transb < 0)
|
if (transb < 0)
|
||||||
info = 2;
|
info = 3;
|
||||||
if (transa < 0)
|
if (transa < 0)
|
||||||
info = 1;
|
info = 2;
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
uplo = -1;
|
|
||||||
if (Uplo == CblasUpper)
|
|
||||||
uplo = 0;
|
|
||||||
if (Uplo == CblasLower)
|
|
||||||
uplo = 1;
|
|
||||||
if (uplo < 0)
|
if (uplo < 0)
|
||||||
info = 14;
|
info = 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (info >= 0) {
|
if (info >= 0) {
|
||||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
|
@ -407,37 +423,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if ((m == 0) || (n == 0))
|
if (m == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
const blasint incb = (transb == 0) ? 1 : ldb;
|
const blasint incb = (transb == 0) ? 1 : ldb;
|
||||||
|
|
||||||
if (uplo == 1) {
|
if (uplo == 1) {
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < m; i++) {
|
||||||
j = n - i;
|
j = m - i;
|
||||||
|
|
||||||
l = j;
|
l = j;
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
aa = a + i * 2;
|
aa = a + i * 2;
|
||||||
bb = b + i * ldb * 2;
|
bb = b + i * ldb * 2;
|
||||||
if (transa) {
|
if (transa) {
|
||||||
l = k;
|
|
||||||
aa = a + lda * i * 2;
|
aa = a + lda * i * 2;
|
||||||
bb = b + i * 2;
|
|
||||||
}
|
}
|
||||||
|
if (transb)
|
||||||
|
bb = b + i * 2;
|
||||||
cc = c + i * 2 * ldc + i * 2;
|
cc = c + i * 2 * ldc + i * 2;
|
||||||
#else
|
#else
|
||||||
aa = a + i;
|
aa = a + i;
|
||||||
bb = b + i * ldb;
|
bb = b + i * ldb;
|
||||||
if (transa) {
|
if (transa) {
|
||||||
l = k;
|
|
||||||
aa = a + lda * i;
|
aa = a + lda * i;
|
||||||
bb = b + i;
|
|
||||||
}
|
}
|
||||||
|
if (transb)
|
||||||
|
bb = b + i;
|
||||||
cc = c + i * ldc + i;
|
cc = c + i * ldc + i;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -458,8 +472,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
buffer_size += 160 / sizeof(FLOAT);
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
|
@ -479,20 +491,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||||
aa, lda, bb, incb, cc, 1,
|
aa, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
|
||||||
|
aa, lda, bb, incb, cc, 1,
|
||||||
|
buffer);
|
||||||
#else
|
#else
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
||||||
bb, incb, cc, 1, buffer);
|
bb, incb, cc, 1, buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
|
||||||
|
bb, incb, cc, 1, buffer);
|
||||||
#endif
|
#endif
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
} else {
|
} else {
|
||||||
|
if (!transa)
|
||||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||||
lda, bb, incb, cc,
|
lda, bb, incb, cc,
|
||||||
1, buffer,
|
1, buffer,
|
||||||
nthreads);
|
nthreads);
|
||||||
|
else
|
||||||
|
(gemv_thread[(int)transa]) (k, j, alpha, aa,
|
||||||
|
lda, bb, incb, cc,
|
||||||
|
1, buffer,
|
||||||
|
nthreads);
|
||||||
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -501,21 +527,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
for (i = 0; i < n; i++) {
|
for (i = 0; i < m; i++) {
|
||||||
j = i + 1;
|
j = i + 1;
|
||||||
|
|
||||||
l = j;
|
l = j;
|
||||||
#if defined COMPLEX
|
#if defined COMPLEX
|
||||||
bb = b + i * ldb * 2;
|
bb = b + i * ldb * 2;
|
||||||
if (transa) {
|
if (transb) {
|
||||||
l = k;
|
|
||||||
bb = b + i * 2;
|
bb = b + i * 2;
|
||||||
}
|
}
|
||||||
cc = c + i * 2 * ldc;
|
cc = c + i * 2 * ldc;
|
||||||
#else
|
#else
|
||||||
bb = b + i * ldb;
|
bb = b + i * ldb;
|
||||||
if (transa) {
|
if (transb) {
|
||||||
l = k;
|
|
||||||
bb = b + i;
|
bb = b + i;
|
||||||
}
|
}
|
||||||
cc = c + i * ldc;
|
cc = c + i * ldc;
|
||||||
|
@ -537,8 +561,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
buffer_size += 160 / sizeof(FLOAT);
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
|
@ -558,30 +580,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||||
a, lda, bb, incb, cc, 1,
|
a, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
|
||||||
|
a, lda, bb, incb, cc, 1,
|
||||||
|
buffer);
|
||||||
#else
|
#else
|
||||||
|
if (!transa)
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
||||||
incb, cc, 1, buffer);
|
incb, cc, 1, buffer);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
|
||||||
|
incb, cc, 1, buffer);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
} else {
|
} else {
|
||||||
|
if (!transa)
|
||||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||||
bb, incb, cc, 1,
|
bb, incb, cc, 1,
|
||||||
buffer, nthreads);
|
buffer, nthreads);
|
||||||
|
else
|
||||||
|
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
|
||||||
|
bb, incb, cc, 1,
|
||||||
|
buffer, nthreads);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
STACK_FREE(buffer);
|
STACK_FREE(buffer);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
|
|
||||||
args.m * args.k + args.k * args.n +
|
|
||||||
args.m * args.n, 2 * args.m * args.n * args.k);
|
|
||||||
|
|
||||||
IDEBUG_END;
|
IDEBUG_END;
|
||||||
|
|
||||||
|
|
|
@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
|
|
||||||
if ( order == BlasColMajor)
|
if ( order == BlasColMajor)
|
||||||
{
|
{
|
||||||
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
|
if ( trans == BlasNoTrans && *ldb < *rows ) info = 8;
|
||||||
if ( trans == BlasTrans && *ldb < *cols ) info = 9;
|
if ( trans == BlasTrans && *ldb < *cols ) info = 8;
|
||||||
}
|
}
|
||||||
if ( order == BlasRowMajor)
|
if ( order == BlasRowMajor)
|
||||||
{
|
{
|
||||||
if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
|
if ( trans == BlasNoTrans && *ldb < *cols ) info = 8;
|
||||||
if ( trans == BlasTrans && *ldb < *rows ) info = 9;
|
if ( trans == BlasTrans && *ldb < *rows ) info = 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( order == BlasColMajor && *lda < *rows ) info = 7;
|
if ( order == BlasColMajor && *lda < *rows ) info = 7;
|
||||||
|
@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef NEW_IMATCOPY
|
#ifdef NEW_IMATCOPY
|
||||||
if ( *lda == *ldb && *rows == *cols) {
|
if ( *lda == *ldb ) {
|
||||||
if ( order == BlasColMajor )
|
if ( order == BlasColMajor )
|
||||||
{
|
{
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda );
|
IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
else
|
else if ( *rows == *cols )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda );
|
IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -138,26 +141,23 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda );
|
IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda );
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
else if ( *rows == *cols )
|
||||||
|
{
|
||||||
|
IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if ( *lda > *ldb )
|
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
|
||||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT);
|
|
||||||
else
|
|
||||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT);
|
|
||||||
|
|
||||||
b = malloc(msize);
|
b = malloc(msize);
|
||||||
if ( b == NULL )
|
if ( b == NULL )
|
||||||
{
|
{
|
||||||
printf("Memory alloc failed\n");
|
printf("Memory alloc failed in imatcopy\n");
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
{
|
{
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows );
|
||||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols );
|
||||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
|
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols );
|
||||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
|
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb );
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows );
|
||||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
|
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
||||||
|
|
||||||
if (n <= 0) return 0.;
|
if (n <= 0) return 0.;
|
||||||
|
|
||||||
|
#ifndef COMPLEX
|
||||||
|
if (n == 1)
|
||||||
|
#ifdef DOUBLE
|
||||||
|
return fabs(x[0]);
|
||||||
|
#else
|
||||||
|
return fabsf(x[0]);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (incx < 0)
|
||||||
|
#ifdef COMPLEX
|
||||||
|
x -= (n - 1) * incx * 2;
|
||||||
|
#else
|
||||||
|
x -= (n - 1) * incx;
|
||||||
|
#endif
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
FUNCTION_PROFILE_START();
|
||||||
|
@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||||
|
|
||||||
if (n <= 0) return 0.;
|
if (n <= 0) return 0.;
|
||||||
|
|
||||||
|
#ifndef COMPLEX
|
||||||
|
if (n == 1)
|
||||||
|
#ifdef DOUBLE
|
||||||
|
return fabs(x[0]);
|
||||||
|
#else
|
||||||
|
return fabsf(x[0]);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (incx < 0)
|
||||||
|
#ifdef COMPLEX
|
||||||
|
x -= (n - 1) * incx * 2;
|
||||||
|
#else
|
||||||
|
x -= (n - 1) * incx;
|
||||||
|
#endif
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
FUNCTION_PROFILE_START();
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <float.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#ifdef FUNCTION_PROFILE
|
#ifdef FUNCTION_PROFILE
|
||||||
#include "functable.h"
|
#include "functable.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
|
@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef DOUBLE
|
||||||
|
long double safmin = DBL_MIN;
|
||||||
|
#else
|
||||||
|
long double safmin = FLT_MIN;
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
|
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
|
||||||
|
|
||||||
long double da = *DA;
|
long double da = *DA;
|
||||||
long double db = *DB;
|
long double db = *DB;
|
||||||
long double c;
|
long double c;
|
||||||
long double s;
|
long double s;
|
||||||
long double r, roe, z;
|
long double r, z;
|
||||||
|
long double sigma, dascal,dbscal;
|
||||||
|
|
||||||
long double ada = fabsl(da);
|
long double ada = fabsl(da);
|
||||||
long double adb = fabsl(db);
|
long double adb = fabsl(db);
|
||||||
long double scale = ada + adb;
|
long double maxab = MAX(ada,adb);
|
||||||
|
long double safmax;
|
||||||
|
long double scale;
|
||||||
|
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
PRINT_DEBUG_NAME;
|
PRINT_DEBUG_NAME;
|
||||||
|
@ -32,17 +44,25 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
PRINT_DEBUG_CNAME;
|
PRINT_DEBUG_CNAME;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
roe = db;
|
if (adb == ZERO) {
|
||||||
if (ada > adb) roe = da;
|
|
||||||
|
|
||||||
if (scale == ZERO) {
|
|
||||||
*C = ONE;
|
*C = ONE;
|
||||||
*S = ZERO;
|
*S = ZERO;
|
||||||
*DA = ZERO;
|
|
||||||
*DB = ZERO;
|
*DB = ZERO;
|
||||||
|
} else if (ada == ZERO) {
|
||||||
|
*C = ZERO;
|
||||||
|
*S = ONE;
|
||||||
|
*DA = *DB;
|
||||||
|
*DB = ONE;
|
||||||
} else {
|
} else {
|
||||||
r = sqrt(da * da + db * db);
|
safmax = 1./safmin;
|
||||||
if (roe < 0) r = -r;
|
scale = MIN(MAX(safmin,maxab), safmax);
|
||||||
|
if (ada > adb)
|
||||||
|
sigma = copysign(1.,da);
|
||||||
|
else
|
||||||
|
sigma = copysign(1.,db);
|
||||||
|
dascal = da / scale;
|
||||||
|
dbscal = db / scale;
|
||||||
|
r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal));
|
||||||
c = da / r;
|
c = da / r;
|
||||||
s = db / r;
|
s = db / r;
|
||||||
z = ONE;
|
z = ONE;
|
||||||
|
@ -65,11 +85,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
FLOAT db = *DB;
|
FLOAT db = *DB;
|
||||||
FLOAT c = *C;
|
FLOAT c = *C;
|
||||||
FLOAT s = *S;
|
FLOAT s = *S;
|
||||||
FLOAT r, roe, z;
|
FLOAT sigma;
|
||||||
|
FLOAT r, z;
|
||||||
|
|
||||||
FLOAT ada = fabs(da);
|
FLOAT ada = fabs(da);
|
||||||
FLOAT adb = fabs(db);
|
FLOAT adb = fabs(db);
|
||||||
FLOAT scale = ada + adb;
|
FLOAT maxab = MAX(ada,adb);
|
||||||
|
long double safmax ;
|
||||||
|
FLOAT scale ;
|
||||||
|
|
||||||
|
safmax = 1./safmin;
|
||||||
|
scale = MIN(MAX(safmin,maxab), safmax);
|
||||||
|
|
||||||
|
if (ada > adb)
|
||||||
|
sigma = copysign(1.,da);
|
||||||
|
else
|
||||||
|
sigma = copysign(1.,db);
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
PRINT_DEBUG_NAME;
|
PRINT_DEBUG_NAME;
|
||||||
|
@ -77,20 +108,21 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
PRINT_DEBUG_CNAME;
|
PRINT_DEBUG_CNAME;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
roe = db;
|
|
||||||
if (ada > adb) roe = da;
|
|
||||||
|
|
||||||
if (scale == ZERO) {
|
if (adb == ZERO) {
|
||||||
*C = ONE;
|
*C = ONE;
|
||||||
*S = ZERO;
|
*S = ZERO;
|
||||||
*DA = ZERO;
|
|
||||||
*DB = ZERO;
|
*DB = ZERO;
|
||||||
|
} else if (ada == ZERO) {
|
||||||
|
*C = ZERO;
|
||||||
|
*S = ONE;
|
||||||
|
*DA = *DB;
|
||||||
|
*DB = ONE;
|
||||||
} else {
|
} else {
|
||||||
FLOAT aa = da / scale;
|
FLOAT aa = da / scale;
|
||||||
FLOAT bb = db / scale;
|
FLOAT bb = db / scale;
|
||||||
|
|
||||||
r = scale * sqrt(aa * aa + bb * bb);
|
r = sigma * scale * sqrt(aa * aa + bb * bb);
|
||||||
if (roe < 0) r = -r;
|
|
||||||
c = da / r;
|
c = da / r;
|
||||||
s = db / r;
|
s = db / r;
|
||||||
z = ONE;
|
z = ONE;
|
||||||
|
|
|
@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO,
|
||||||
int nodes;
|
int nodes;
|
||||||
#endif
|
#endif
|
||||||
# if defined(SMP)
|
# if defined(SMP)
|
||||||
int MN;
|
double MN;
|
||||||
#endif
|
#endif
|
||||||
blasint info;
|
blasint info;
|
||||||
int side;
|
int side;
|
||||||
|
@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
||||||
int nodes;
|
int nodes;
|
||||||
#endif
|
#endif
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
int MN;
|
double MN;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
PRINT_DEBUG_CNAME;
|
PRINT_DEBUG_CNAME;
|
||||||
|
|
|
@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS,
|
||||||
FLOAT *sa, *sb;
|
FLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int NNK;
|
double NNK;
|
||||||
#ifdef USE_SIMPLE_THREADED_LEVEL3
|
#ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
|
@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
|
||||||
FLOAT *sa, *sb;
|
FLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
int NNK;
|
double NNK;
|
||||||
|
|
||||||
#ifdef USE_SIMPLE_THREADED_LEVEL3
|
#ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||||
#ifndef COMPLEX
|
#ifndef COMPLEX
|
||||||
|
|
|
@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef NEW_IMATCOPY
|
#ifdef NEW_IMATCOPY
|
||||||
if (*lda == *ldb && *cols == *rows) {
|
if (*lda == *ldb ) {
|
||||||
if ( order == BlasColMajor )
|
if ( order == BlasColMajor )
|
||||||
{
|
{
|
||||||
|
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if ( trans == BlasConj )
|
if ( trans == BlasConj )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if ( trans == BlasTrans )
|
if ( trans == BlasTrans && *rows == *cols )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if ( trans == BlasTransConj )
|
if ( trans == BlasTransConj && *rows == *cols )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -153,28 +159,29 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if ( trans == BlasConj )
|
if ( trans == BlasConj )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if ( trans == BlasTrans )
|
if ( trans == BlasTrans && *rows == *cols )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
if ( trans == BlasTransConj )
|
if ( trans == BlasTransConj && *rows == *cols )
|
||||||
{
|
{
|
||||||
IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||||
}
|
|
||||||
}
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if ( *lda > *ldb )
|
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
|
||||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2;
|
|
||||||
else
|
|
||||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2;
|
|
||||||
|
|
||||||
b = malloc(msize);
|
b = malloc(msize);
|
||||||
if ( b == NULL )
|
if ( b == NULL )
|
||||||
|
@ -183,37 +190,28 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
if ( order == BlasColMajor )
|
if ( order == BlasColMajor )
|
||||||
{
|
{
|
||||||
|
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if ( trans == BlasConj )
|
else if ( trans == BlasConj )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if ( trans == BlasTrans )
|
else if ( trans == BlasTrans )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if ( trans == BlasTransConj )
|
else if ( trans == BlasTransConj )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
|
|
||||||
if ( trans == BlasNoTrans )
|
if ( trans == BlasNoTrans )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if ( trans == BlasConj )
|
else if ( trans == BlasConj )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if ( trans == BlasTrans )
|
else if ( trans == BlasTrans )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
if ( trans == BlasTransConj )
|
else if ( trans == BlasTransConj )
|
||||||
{
|
{
|
||||||
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||||
free(b);
|
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
free(b);
|
free(b);
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
#include <float.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#ifdef FUNCTION_PROFILE
|
#ifdef FUNCTION_PROFILE
|
||||||
#include "functable.h"
|
#include "functable.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||||
|
|
||||||
|
@ -14,123 +16,166 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
|
||||||
FLOAT *S = (FLOAT*) VS;
|
FLOAT *S = (FLOAT*) VS;
|
||||||
#endif /* CBLAS */
|
#endif /* CBLAS */
|
||||||
|
|
||||||
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
|
#ifdef DOUBLE
|
||||||
|
long double safmin = DBL_MIN;
|
||||||
long double da_r = *(DA + 0);
|
long double rtmin = sqrt(DBL_MIN/DBL_EPSILON);
|
||||||
long double da_i = *(DA + 1);
|
|
||||||
long double db_r = *(DB + 0);
|
|
||||||
long double db_i = *(DB + 1);
|
|
||||||
long double r;
|
|
||||||
|
|
||||||
long double ada = fabsl(da_r) + fabsl(da_i);
|
|
||||||
|
|
||||||
PRINT_DEBUG_NAME;
|
|
||||||
|
|
||||||
IDEBUG_START;
|
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
if (ada == ZERO) {
|
|
||||||
*C = ZERO;
|
|
||||||
*(S + 0) = ONE;
|
|
||||||
*(S + 1) = ZERO;
|
|
||||||
*(DA + 0) = db_r;
|
|
||||||
*(DA + 1) = db_i;
|
|
||||||
} else {
|
|
||||||
long double alpha_r, alpha_i;
|
|
||||||
|
|
||||||
ada = sqrt(da_r * da_r + da_i * da_i);
|
|
||||||
|
|
||||||
r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i);
|
|
||||||
|
|
||||||
alpha_r = da_r / ada;
|
|
||||||
alpha_i = da_i / ada;
|
|
||||||
|
|
||||||
*(C + 0) = ada / r;
|
|
||||||
*(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r;
|
|
||||||
*(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r;
|
|
||||||
*(DA + 0) = alpha_r * r;
|
|
||||||
*(DA + 1) = alpha_i * r;
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
FLOAT da_r = *(DA + 0);
|
long double safmin = FLT_MIN;
|
||||||
FLOAT da_i = *(DA + 1);
|
long double rtmin = sqrt(FLT_MIN/FLT_EPSILON);
|
||||||
FLOAT db_r = *(DB + 0);
|
|
||||||
FLOAT db_i = *(DB + 1);
|
|
||||||
FLOAT r;
|
|
||||||
|
|
||||||
FLOAT ada = fabs(da_r) + fabs(da_i);
|
|
||||||
FLOAT adb;
|
|
||||||
|
|
||||||
PRINT_DEBUG_NAME;
|
|
||||||
|
|
||||||
IDEBUG_START;
|
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
|
||||||
|
|
||||||
if (ada == ZERO) {
|
|
||||||
*C = ZERO;
|
|
||||||
*(S + 0) = ONE;
|
|
||||||
*(S + 1) = ZERO;
|
|
||||||
*(DA + 0) = db_r;
|
|
||||||
*(DA + 1) = db_i;
|
|
||||||
} else {
|
|
||||||
FLOAT scale;
|
|
||||||
FLOAT aa_r, aa_i, bb_r, bb_i;
|
|
||||||
FLOAT alpha_r, alpha_i;
|
|
||||||
|
|
||||||
aa_r = fabs(da_r);
|
|
||||||
aa_i = fabs(da_i);
|
|
||||||
|
|
||||||
if (aa_i > aa_r) {
|
|
||||||
aa_r = fabs(da_i);
|
|
||||||
aa_i = fabs(da_r);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (aa_r == ZERO) {
|
|
||||||
ada = 0.;
|
|
||||||
} else {
|
|
||||||
scale = (aa_i / aa_r);
|
|
||||||
ada = aa_r * sqrt(ONE + scale * scale);
|
|
||||||
}
|
|
||||||
|
|
||||||
bb_r = fabs(db_r);
|
|
||||||
bb_i = fabs(db_i);
|
|
||||||
|
|
||||||
if (bb_i > bb_r) {
|
|
||||||
bb_r = fabs(bb_i);
|
|
||||||
bb_i = fabs(bb_r);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (bb_r == ZERO) {
|
|
||||||
adb = 0.;
|
|
||||||
} else {
|
|
||||||
scale = (bb_i / bb_r);
|
|
||||||
adb = bb_r * sqrt(ONE + scale * scale);
|
|
||||||
}
|
|
||||||
scale = ada + adb;
|
|
||||||
|
|
||||||
aa_r = da_r / scale;
|
|
||||||
aa_i = da_i / scale;
|
|
||||||
bb_r = db_r / scale;
|
|
||||||
bb_i = db_i / scale;
|
|
||||||
|
|
||||||
r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
|
|
||||||
|
|
||||||
alpha_r = da_r / ada;
|
|
||||||
alpha_i = da_i / ada;
|
|
||||||
|
|
||||||
*(C + 0) = ada / r;
|
|
||||||
*(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r;
|
|
||||||
*(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r;
|
|
||||||
*(DA + 0) = alpha_r * r;
|
|
||||||
*(DA + 1) = alpha_i * r;
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
FUNCTION_PROFILE_END(4, 4, 4);
|
|
||||||
|
|
||||||
IDEBUG_END;
|
FLOAT da_r = *(DA+0);
|
||||||
|
FLOAT da_i = *(DA+1);
|
||||||
|
FLOAT db_r = *(DB+0);
|
||||||
|
FLOAT db_i = *(DB+1);
|
||||||
|
//long double r;
|
||||||
|
FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT));
|
||||||
|
FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT));
|
||||||
|
long double d;
|
||||||
|
|
||||||
|
FLOAT ada = da_r * da_r + da_i * da_i;
|
||||||
|
FLOAT adb = db_r * db_r + db_i * db_i;
|
||||||
|
FLOAT adart = sqrt( da_r * da_r + da_i * da_i);
|
||||||
|
FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i);
|
||||||
|
|
||||||
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
FUNCTION_PROFILE_START();
|
||||||
|
|
||||||
|
if (db_r == ZERO && db_i == ZERO) {
|
||||||
|
*C = ONE;
|
||||||
|
*(S + 0) = ZERO;
|
||||||
|
*(S + 1) = ZERO;
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
long double safmax = 1./safmin;
|
||||||
|
#if defined DOUBLE
|
||||||
|
long double rtmax = safmax /DBL_EPSILON;
|
||||||
|
#else
|
||||||
|
long double rtmax = safmax /FLT_EPSILON;
|
||||||
|
#endif
|
||||||
|
*(S1 + 0) = *(DB + 0);
|
||||||
|
*(S1 + 1) = *(DB + 1) *-1;
|
||||||
|
if (da_r == ZERO && da_i == ZERO) {
|
||||||
|
*C = ZERO;
|
||||||
|
if (db_r == ZERO) {
|
||||||
|
(*DA) = fabsl(db_i);
|
||||||
|
*S = *S1 /da_r;
|
||||||
|
*(S+1) = *(S1+1) /da_r;
|
||||||
|
return;
|
||||||
|
} else if ( db_i == ZERO) {
|
||||||
|
*DA = fabsl(db_r);
|
||||||
|
*S = *S1 /da_r;
|
||||||
|
*(S+1) = *(S1+1) /da_r;
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
long double g1 = MAX( fabsl(db_r), fabsl(db_i));
|
||||||
|
rtmax =sqrt(safmax/2.);
|
||||||
|
if (g1 > rtmin && g1 < rtmax) { // unscaled
|
||||||
|
d = sqrt(adb);
|
||||||
|
*S = *S1 /d;
|
||||||
|
*(S+1) = *(S1+1) /d;
|
||||||
|
*DA = d ;
|
||||||
|
*(DA+1) = ZERO;
|
||||||
|
return;
|
||||||
|
} else { // scaled algorithm
|
||||||
|
long double u = MIN ( safmax, MAX ( safmin, g1));
|
||||||
|
FLOAT gs_r = db_r/u;
|
||||||
|
FLOAT gs_i = db_i/u;
|
||||||
|
d = sqrt ( gs_r*gs_r + gs_i*gs_i);
|
||||||
|
*S = gs_r / d;
|
||||||
|
*(S + 1) = (gs_i * -1) / d;
|
||||||
|
*DA = d * u;
|
||||||
|
*(DA+1) = ZERO;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i));
|
||||||
|
FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i));
|
||||||
|
rtmax = sqrt(safmax / 4.);
|
||||||
|
if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled
|
||||||
|
long double h = ada + adb;
|
||||||
|
double adahsq = sqrt(ada * h);
|
||||||
|
if (ada >= h *safmin) {
|
||||||
|
*C = sqrt(ada/h);
|
||||||
|
*R = *DA / *C;
|
||||||
|
*(R+1) = *(DA+1) / *(C+1);
|
||||||
|
rtmax *= 2.;
|
||||||
|
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
|
||||||
|
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
|
||||||
|
*(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq);
|
||||||
|
} else {
|
||||||
|
*S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h);
|
||||||
|
*(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
*C = ada / adahsq;
|
||||||
|
if (*C >= safmin)
|
||||||
|
*R = *DA / *C;
|
||||||
|
else
|
||||||
|
*R = *DA * (h / adahsq);
|
||||||
|
*S = *S1 * ada / adahsq;
|
||||||
|
*(S+1) = *(S1+1) * ada / adahsq;
|
||||||
|
}
|
||||||
|
*DA=*R;
|
||||||
|
*(DA+1)=*(R+1);
|
||||||
|
return;
|
||||||
|
} else { // scaled
|
||||||
|
FLOAT fs_r, fs_i, gs_r, gs_i;
|
||||||
|
long double v,w,f2,g2,h;
|
||||||
|
long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1)));
|
||||||
|
gs_r = db_r/u;
|
||||||
|
gs_i = db_i/u;
|
||||||
|
g2 = sqrt ( gs_r*gs_r + gs_i*gs_i);
|
||||||
|
if (f1 /u < rtmin) {
|
||||||
|
v = MIN (safmax, MAX (safmin, f1));
|
||||||
|
w = v / u;
|
||||||
|
fs_r = *DA/ v;
|
||||||
|
fs_i = *(DA+1) / v;
|
||||||
|
f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
|
||||||
|
h = f2 * w * w + g2;
|
||||||
|
} else { // use same scaling for both
|
||||||
|
w = 1.;
|
||||||
|
fs_r = *DA/ u;
|
||||||
|
fs_i = *(DA+1) / u;
|
||||||
|
f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
|
||||||
|
h = f2 + g2;
|
||||||
|
}
|
||||||
|
if ( f2 >= h * safmin) {
|
||||||
|
*C = sqrt ( f2 / h );
|
||||||
|
*DA = fs_r / *C;
|
||||||
|
*(DA+1) = fs_i / *C;
|
||||||
|
rtmax *= 2;
|
||||||
|
if ( f2 > rtmin && h < rtmax) {
|
||||||
|
*S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h));
|
||||||
|
*(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h));
|
||||||
|
} else {
|
||||||
|
*S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h);
|
||||||
|
*(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h);
|
||||||
|
}
|
||||||
|
} else { // intermediates might overflow
|
||||||
|
d = sqrt ( f2 * h);
|
||||||
|
*C = f2 /d;
|
||||||
|
if (*C >= safmin) {
|
||||||
|
*DA = fs_r / *C;
|
||||||
|
*(DA+1) = fs_i / *C;
|
||||||
|
} else {
|
||||||
|
*DA = fs_r * (h / d);
|
||||||
|
*(DA+1) = fs_i / (h / d);
|
||||||
|
}
|
||||||
|
*S = gs_r * (fs_r /d) - gs_i * (fs_i / d);
|
||||||
|
*(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d);
|
||||||
|
}
|
||||||
|
*C *= w;
|
||||||
|
*DA *= u;
|
||||||
|
*(DA+1) *= u;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,7 +33,7 @@ endif
|
||||||
ifdef TARGET_CORE
|
ifdef TARGET_CORE
|
||||||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))
|
||||||
override CFLAGS += -march=sapphirerapids
|
override CFLAGS += -march=sapphirerapids
|
||||||
else
|
else
|
||||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||||
|
@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||||
endif
|
endif
|
||||||
else ifeq ($(TARGET_CORE), COOPERLAKE)
|
else ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9)))
|
||||||
override CFLAGS += -march=cooperlake
|
override CFLAGS += -march=cooperlake
|
||||||
else
|
else
|
||||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||||
|
@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN)
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||||
else ifeq ($(TARGET_CORE), LOONGSON3R4)
|
else ifeq ($(TARGET_CORE), LOONGSON3R4)
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
|
||||||
|
else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
|
||||||
|
ifeq ($(C_COMPILER), PGI)
|
||||||
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
|
||||||
|
else
|
||||||
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -35,6 +35,12 @@ USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifneq ($(DYNAMIC_ARCH), 1)
|
||||||
|
ifeq ($(TARGET), MIPS64_GENERIC)
|
||||||
|
USE_TRMM = 1
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), HASWELL)
|
ifeq ($(CORE), HASWELL)
|
||||||
USE_TRMM = 1
|
USE_TRMM = 1
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT absxi = 0.0;
|
FLOAT absxi = 0.0;
|
||||||
|
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
if (n <= 0 || inc_x == 0) return(0.0);
|
||||||
if ( n == 1 ) return( ABS(x[0]) );
|
if ( n == 1 ) return( ABS(x[0]) );
|
||||||
|
|
||||||
n *= inc_x;
|
n *= inc_x;
|
||||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
BLASLONG inc_x2;
|
BLASLONG inc_x2;
|
||||||
FLOAT temp;
|
FLOAT temp;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
if (n <= 0 || inc_x == 0) return(0.0);
|
||||||
|
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
|
|
||||||
|
|
|
@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S
|
||||||
ZAMAXKERNEL = zamax.S
|
ZAMAXKERNEL = zamax.S
|
||||||
|
|
||||||
SAXPYKERNEL = axpy.S
|
SAXPYKERNEL = axpy.S
|
||||||
DAXPYKERNEL = axpy.S
|
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||||
CAXPYKERNEL = zaxpy.S
|
CAXPYKERNEL = zaxpy.S
|
||||||
ZAXPYKERNEL = zaxpy.S
|
ZAXPYKERNEL = zaxpy.S
|
||||||
|
|
||||||
|
@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
|
||||||
CGEMVTKERNEL = zgemv_t.S
|
CGEMVTKERNEL = zgemv_t.S
|
||||||
ZGEMVTKERNEL = zgemv_t.S
|
ZGEMVTKERNEL = zgemv_t.S
|
||||||
|
|
||||||
|
SASUMKERNEL = sasum_thunderx2t99.c
|
||||||
|
DASUMKERNEL = dasum_thunderx2t99.c
|
||||||
|
CASUMKERNEL = casum_thunderx2t99.c
|
||||||
|
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||||
|
|
||||||
SASUMKERNEL = asum.S
|
SCOPYKERNEL = copy_thunderx2t99.c
|
||||||
DASUMKERNEL = asum.S
|
DCOPYKERNEL = copy_thunderx2t99.c
|
||||||
CASUMKERNEL = casum.S
|
CCOPYKERNEL = copy_thunderx2t99.c
|
||||||
ZASUMKERNEL = zasum.S
|
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||||
|
|
||||||
SCOPYKERNEL = copy.S
|
SSWAPKERNEL = swap_thunderx2t99.S
|
||||||
DCOPYKERNEL = copy.S
|
DSWAPKERNEL = swap_thunderx2t99.S
|
||||||
CCOPYKERNEL = copy.S
|
CSWAPKERNEL = swap_thunderx2t99.S
|
||||||
ZCOPYKERNEL = copy.S
|
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||||
|
|
||||||
SSWAPKERNEL = swap.S
|
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
DSWAPKERNEL = swap.S
|
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
CSWAPKERNEL = swap.S
|
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
ZSWAPKERNEL = swap.S
|
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
|
||||||
ISAMAXKERNEL = iamax.S
|
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
IDAMAXKERNEL = iamax.S
|
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
ICAMAXKERNEL = izamax.S
|
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
IZAMAXKERNEL = izamax.S
|
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
|
||||||
SNRM2KERNEL = nrm2.S
|
DDOTKERNEL = dot.c
|
||||||
DNRM2KERNEL = nrm2.S
|
SDOTKERNEL = dot.c
|
||||||
CNRM2KERNEL = znrm2.S
|
CDOTKERNEL = zdot_thunderx2t99.c
|
||||||
ZNRM2KERNEL = znrm2.S
|
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||||
|
|
||||||
DDOTKERNEL = dot.S
|
|
||||||
ifneq ($(C_COMPILER), PGI)
|
|
||||||
SDOTKERNEL = ../generic/dot.c
|
|
||||||
else
|
|
||||||
SDOTKERNEL = dot.S
|
|
||||||
endif
|
|
||||||
ifneq ($(C_COMPILER), PGI)
|
|
||||||
CDOTKERNEL = zdot.S
|
|
||||||
ZDOTKERNEL = zdot.S
|
|
||||||
else
|
|
||||||
CDOTKERNEL = ../arm/zdot.c
|
|
||||||
ZDOTKERNEL = ../arm/zdot.c
|
|
||||||
endif
|
|
||||||
DSDOTKERNEL = dot.S
|
DSDOTKERNEL = dot.S
|
||||||
|
|
||||||
DGEMM_BETA = dgemm_beta.S
|
DGEMM_BETA = dgemm_beta.S
|
||||||
|
@ -128,10 +118,10 @@ SGEMM_BETA = sgemm_beta.S
|
||||||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
|
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
|
||||||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
|
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
|
||||||
|
|
||||||
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
|
SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
|
||||||
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
|
SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
|
||||||
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
|
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||||
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
|
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||||
|
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
@ -149,8 +139,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c
|
||||||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
|
||||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
|
||||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||||
|
|
||||||
|
@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||||
|
|
||||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
|
||||||
|
@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||||
|
|
||||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||||
|
|
||||||
|
|
|
@ -1,189 +1 @@
|
||||||
SAMINKERNEL = ../arm/amin.c
|
include $(KERNELDIR)/KERNEL.ARMV8SVE
|
||||||
DAMINKERNEL = ../arm/amin.c
|
|
||||||
CAMINKERNEL = ../arm/zamin.c
|
|
||||||
ZAMINKERNEL = ../arm/zamin.c
|
|
||||||
|
|
||||||
SMAXKERNEL = ../arm/max.c
|
|
||||||
DMAXKERNEL = ../arm/max.c
|
|
||||||
|
|
||||||
SMINKERNEL = ../arm/min.c
|
|
||||||
DMINKERNEL = ../arm/min.c
|
|
||||||
|
|
||||||
ISAMINKERNEL = ../arm/iamin.c
|
|
||||||
IDAMINKERNEL = ../arm/iamin.c
|
|
||||||
ICAMINKERNEL = ../arm/izamin.c
|
|
||||||
IZAMINKERNEL = ../arm/izamin.c
|
|
||||||
|
|
||||||
ISMAXKERNEL = ../arm/imax.c
|
|
||||||
IDMAXKERNEL = ../arm/imax.c
|
|
||||||
|
|
||||||
ISMINKERNEL = ../arm/imin.c
|
|
||||||
IDMINKERNEL = ../arm/imin.c
|
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|
||||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|
||||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
|
||||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|
||||||
|
|
||||||
SAMAXKERNEL = amax.S
|
|
||||||
DAMAXKERNEL = amax.S
|
|
||||||
CAMAXKERNEL = zamax.S
|
|
||||||
ZAMAXKERNEL = zamax.S
|
|
||||||
|
|
||||||
SAXPYKERNEL = axpy.S
|
|
||||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
|
||||||
CAXPYKERNEL = zaxpy.S
|
|
||||||
ZAXPYKERNEL = zaxpy.S
|
|
||||||
|
|
||||||
SROTKERNEL = rot.S
|
|
||||||
DROTKERNEL = rot.S
|
|
||||||
CROTKERNEL = zrot.S
|
|
||||||
ZROTKERNEL = zrot.S
|
|
||||||
|
|
||||||
SSCALKERNEL = scal.S
|
|
||||||
DSCALKERNEL = scal.S
|
|
||||||
CSCALKERNEL = zscal.S
|
|
||||||
ZSCALKERNEL = zscal.S
|
|
||||||
|
|
||||||
SGEMVNKERNEL = gemv_n.S
|
|
||||||
DGEMVNKERNEL = gemv_n.S
|
|
||||||
CGEMVNKERNEL = zgemv_n.S
|
|
||||||
ZGEMVNKERNEL = zgemv_n.S
|
|
||||||
|
|
||||||
SGEMVTKERNEL = gemv_t.S
|
|
||||||
DGEMVTKERNEL = gemv_t.S
|
|
||||||
CGEMVTKERNEL = zgemv_t.S
|
|
||||||
ZGEMVTKERNEL = zgemv_t.S
|
|
||||||
|
|
||||||
|
|
||||||
SASUMKERNEL = sasum_thunderx2t99.c
|
|
||||||
DASUMKERNEL = dasum_thunderx2t99.c
|
|
||||||
CASUMKERNEL = casum_thunderx2t99.c
|
|
||||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
|
||||||
|
|
||||||
SCOPYKERNEL = copy_thunderx2t99.c
|
|
||||||
DCOPYKERNEL = copy_thunderx2t99.c
|
|
||||||
CCOPYKERNEL = copy_thunderx2t99.c
|
|
||||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
|
||||||
|
|
||||||
SSWAPKERNEL = swap_thunderx2t99.S
|
|
||||||
DSWAPKERNEL = swap_thunderx2t99.S
|
|
||||||
CSWAPKERNEL = swap_thunderx2t99.S
|
|
||||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
|
||||||
|
|
||||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
|
||||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
|
||||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
|
||||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
|
||||||
|
|
||||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
|
||||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
|
||||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
|
||||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
|
||||||
|
|
||||||
DDOTKERNEL = dot.c
|
|
||||||
SDOTKERNEL = dot.c
|
|
||||||
CDOTKERNEL = zdot_thunderx2t99.c
|
|
||||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
|
||||||
DSDOTKERNEL = dot.S
|
|
||||||
|
|
||||||
DGEMM_BETA = dgemm_beta.S
|
|
||||||
SGEMM_BETA = sgemm_beta.S
|
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
|
||||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
|
||||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
|
||||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
|
||||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
|
||||||
else
|
|
||||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
|
||||||
endif
|
|
||||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
|
||||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
|
||||||
else
|
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
|
||||||
endif
|
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
endif
|
|
||||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
|
||||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
|
||||||
else
|
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
|
||||||
endif
|
|
||||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
|
||||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
|
||||||
else
|
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
|
||||||
endif
|
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
|
||||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
|
||||||
|
|
||||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
|
||||||
|
|
||||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
|
||||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
|
||||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
|
||||||
else
|
|
||||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
|
||||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
|
||||||
endif
|
|
||||||
|
|
||||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
|
||||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
|
||||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
|
||||||
else
|
|
||||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
|
||||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
|
||||||
endif
|
|
||||||
|
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
|
||||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
|
||||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
|
||||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
|
||||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
endif
|
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
|
|
||||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
|
||||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
|
||||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
|
||||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
|
||||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
|
||||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
endif
|
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
|
||||||
|
|
|
@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define pCRow3 x15
|
#define pCRow3 x15
|
||||||
#define pA x16
|
#define pA x16
|
||||||
#define alphaR w17
|
#define alphaR w17
|
||||||
#define alphaI w18
|
#define alphaI w19
|
||||||
|
|
||||||
#define alpha0_R s10
|
#define alpha0_R s10
|
||||||
#define alphaV0_R v10.s[0]
|
#define alphaV0_R v10.s[0]
|
||||||
|
|
|
@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define pCRow3 x15
|
#define pCRow3 x15
|
||||||
#define pA x16
|
#define pA x16
|
||||||
#define alphaR w17
|
#define alphaR w17
|
||||||
#define alphaI w18
|
#define alphaI w19
|
||||||
|
|
||||||
#define alpha0_R s10
|
#define alpha0_R s10
|
||||||
#define alphaV0_R v10.s[0]
|
#define alphaV0_R v10.s[0]
|
||||||
|
|
|
@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
add pB, pB, 32
|
add pB, pB, 32
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNELv1x4_M1
|
.macro KERNELv1x4_M1
|
||||||
|
@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
ld1rw z15.s, p0/z, [pB, 28]
|
ld1rw z15.s, p0/z, [pB, 28]
|
||||||
|
|
||||||
add pB, pB, 32
|
add pB, pB, 32
|
||||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNELv1x4_M2
|
.macro KERNELv1x4_M2
|
||||||
|
@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
OP_ri z23.s, p1/m, z2.s, z15.s
|
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||||
ld1rw z15.s, p0/z, [pB, 28]
|
ld1rw z15.s, p0/z, [pB, 28]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
|
||||||
|
|
||||||
add pB, pB, 32
|
add pB, pB, 32
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNELv1x4_E
|
.macro KERNELv1x4_E
|
||||||
|
@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
OP_ii z22.s, p1/m, z3.s, z15.s
|
OP_ii z22.s, p1/m, z3.s, z15.s
|
||||||
OP_ri z23.s, p1/m, z2.s, z15.s
|
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNELv1x4_SUB
|
.macro KERNELv1x4_SUB
|
||||||
|
@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
OP_ii z22.s, p1/m, z1.s, z15.s
|
OP_ii z22.s, p1/m, z1.s, z15.s
|
||||||
OP_ri z23.s, p1/m, z0.s, z15.s
|
OP_ri z23.s, p1/m, z0.s, z15.s
|
||||||
|
|
||||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
|
||||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SAVEv1x4
|
.macro SAVEv1x4
|
||||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||||
|
@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||||
|
|
||||||
add pCRow1, pCRow1, lanes, lsl #3
|
add pCRow1, pCRow1, lanes, lsl #3
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ld2w {z28.s, z29.s}, p1/z, [pCRow2]
|
ld2w {z28.s, z29.s}, p1/z, [pCRow2]
|
||||||
fmla z28.s, p1/m, z20.s, alphaz_R
|
fmla z28.s, p1/m, z20.s, alphaz_R
|
||||||
fmls z28.s, p1/m, z21.s, alphaz_I
|
fmls z28.s, p1/m, z21.s, alphaz_I
|
||||||
|
@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
fmla z31.s, p1/m, z23.s, alphaz_R
|
fmla z31.s, p1/m, z23.s, alphaz_R
|
||||||
st2w {z30.s, z31.s}, p1, [pCRow3]
|
st2w {z30.s, z31.s}, p1, [pCRow3]
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SAVEv1x2
|
.macro SAVEv1x2
|
||||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||||
|
@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||||
|
|
||||||
add pCRow1, pCRow1, lanes, lsl #3
|
add pCRow1, pCRow1, lanes, lsl #3
|
||||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro SAVEv1x1
|
.macro SAVEv1x1
|
||||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||||
|
@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||||
|
|
||||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
/******************************************************************************/
|
/******************************************************************************/
|
||||||
|
@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stp x26, x27, [sp, #(9 * 16)]
|
stp x26, x27, [sp, #(9 * 16)]
|
||||||
str x28, [sp, #(10 * 16)]
|
str x28, [sp, #(10 * 16)]
|
||||||
|
|
||||||
prfm PLDL1KEEP, [origPB]
|
|
||||||
prfm PLDL1KEEP, [origPA]
|
|
||||||
|
|
||||||
fmov alphaR, s0
|
fmov alphaR, s0
|
||||||
dup alphaz_R, alphaR
|
dup alphaz_R, alphaR
|
||||||
fmov alphaI, s1
|
fmov alphaI, s1
|
||||||
|
@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
bne .Lcgemm_kernel_L4_Mv1_46
|
bne .Lcgemm_kernel_L4_Mv1_46
|
||||||
|
|
||||||
.Lcgemm_kernel_L4_Mv1_100:
|
.Lcgemm_kernel_L4_Mv1_100:
|
||||||
prfm PLDL1KEEP, [pA]
|
|
||||||
prfm PLDL1KEEP, [pA, #64]
|
|
||||||
prfm PLDL1KEEP, [origPB]
|
|
||||||
|
|
||||||
SAVEv1x4
|
SAVEv1x4
|
||||||
|
|
||||||
.Lcgemm_kernel_L4_Mv1_END:
|
.Lcgemm_kernel_L4_Mv1_END:
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
||||||
j = 0;
|
j = 0;
|
||||||
svbool_t pg = svwhilelt_b32(j, n);
|
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||||
do {
|
do {
|
||||||
|
|
||||||
|
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
aoffset += active * lda * 2;
|
aoffset += active * lda * 2;
|
||||||
|
|
||||||
j += svcntw();
|
j += svcntw();
|
||||||
pg = svwhilelt_b32(j, n);
|
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||||
active = svcntp_b32(svptrue_b32(), pg);
|
active = svcntp_b32(svptrue_b32(), pg);
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
||||||
j = 0;
|
j = 0;
|
||||||
svbool_t pg = svwhilelt_b32(j, n);
|
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||||
do {
|
do {
|
||||||
|
|
||||||
|
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
aoffset += active * 2;
|
aoffset += active * 2;
|
||||||
|
|
||||||
j += svcntw();
|
j += svcntw();
|
||||||
pg = svwhilelt_b32(j, n);
|
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||||
active = svcntp_b32(svptrue_b32(), pg);
|
active = svcntp_b32(svptrue_b32(), pg);
|
||||||
|
|
||||||
} while (svptest_any(svptrue_b32(), pg));
|
} while (svptest_any(svptrue_b32(), pg));
|
||||||
|
|
|
@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define pCRow3 x15
|
#define pCRow3 x15
|
||||||
#define pA x16
|
#define pA x16
|
||||||
#define alphaR w17
|
#define alphaR w17
|
||||||
#define alphaI w18
|
#define alphaI w19
|
||||||
#define temp x19
|
#define temp x20
|
||||||
#define tempOffset x20
|
#define tempOffset x21
|
||||||
#define tempK x21
|
#define tempK x22
|
||||||
|
|
||||||
#define alpha0_R s10
|
#define alpha0_R s10
|
||||||
#define alphaV0_R v10.s[0]
|
#define alphaV0_R v10.s[0]
|
||||||
|
|
|
@ -1,79 +0,0 @@
|
||||||
/*********************************************************************/
|
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
||||||
/* All rights reserved. */
|
|
||||||
/* */
|
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
|
||||||
/* without modification, are permitted provided that the following */
|
|
||||||
/* conditions are met: */
|
|
||||||
/* */
|
|
||||||
/* 1. Redistributions of source code must retain the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer. */
|
|
||||||
/* */
|
|
||||||
/* 2. Redistributions in binary form must reproduce the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer in the documentation and/or other materials */
|
|
||||||
/* provided with the distribution. */
|
|
||||||
/* */
|
|
||||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
||||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
||||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
||||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
||||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
||||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
||||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
||||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
||||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
||||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
||||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
||||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
|
||||||
/* */
|
|
||||||
/* The views and conclusions contained in the software and */
|
|
||||||
/* documentation are those of the authors and should not be */
|
|
||||||
/* interpreted as representing official policies, either expressed */
|
|
||||||
/* or implied, of The University of Texas at Austin. */
|
|
||||||
/*********************************************************************/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "common.h"
|
|
||||||
#include <arm_sve.h>
|
|
||||||
|
|
||||||
// TODO: write in assembly with proper unrolling of inner loop
|
|
||||||
/*
 * Packs `a` into `b` one vector-wide panel at a time using 64-bit SVE lanes.
 *
 * For every panel of up to svcntd() lanes, and for each of the m positions
 * i = 0 .. m-1, the active lanes gather the elements a[i + k * lda]
 * (k = lane number) and store them contiguously at the current write
 * position in b, which then advances by the number of active lanes.
 * The final panel may be partial; its predicate is produced by
 * svwhilelt_b64(j, n).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Index vector {0, lda, 2*lda, ...}: lane k reads k*lda elements further on. */
    svint64_t stride_idx = svindex_s64(0LL, lda);
    uint64_t lanes = svcntd();

    IFLOAT *panel = a;
    IFLOAT *dst = b;

    BLASLONG col = 0;
    svbool_t live = svwhilelt_b64(col, n);
    uint64_t live_cnt = svcntp_b64(svptrue_b64(), live);

    do {
        IFLOAT *src = panel;

        for (uint64_t rows_left = m; rows_left != 0; rows_left--) {
            svfloat64_t gathered = svld1_gather_index(live, (double *) src, stride_idx);
            svst1_f64(live, (double *) dst, gathered);
            src++;
            dst += live_cnt;
        }

        /* Step to the next group of `lanes` strided vectors. */
        panel += lanes * lda;

        col += svcntd();
        live = svwhilelt_b64(col, n);
        live_cnt = svcntp_b64(svptrue_b64(), live);
    } while (svptest_any(svptrue_b64(), live));

    return 0;
}
|
|
|
@ -1,77 +0,0 @@
|
||||||
/*********************************************************************/
|
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
||||||
/* All rights reserved. */
|
|
||||||
/* */
|
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
|
||||||
/* without modification, are permitted provided that the following */
|
|
||||||
/* conditions are met: */
|
|
||||||
/* */
|
|
||||||
/* 1. Redistributions of source code must retain the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer. */
|
|
||||||
/* */
|
|
||||||
/* 2. Redistributions in binary form must reproduce the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer in the documentation and/or other materials */
|
|
||||||
/* provided with the distribution. */
|
|
||||||
/* */
|
|
||||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
||||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
||||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
||||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
||||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
||||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
||||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
||||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
||||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
||||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
||||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
||||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
|
||||||
/* */
|
|
||||||
/* The views and conclusions contained in the software and */
|
|
||||||
/* documentation are those of the authors and should not be */
|
|
||||||
/* interpreted as representing official policies, either expressed */
|
|
||||||
/* or implied, of The University of Texas at Austin. */
|
|
||||||
/*********************************************************************/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "common.h"
|
|
||||||
#include <arm_sve.h>
|
|
||||||
|
|
||||||
// TODO: write in assembly with proper unrolling of inner loop
|
|
||||||
/*
 * Packs `a` into `b` one vector-wide panel at a time using 64-bit SVE lanes
 * and contiguous loads.
 *
 * For every panel of up to svcntd() elements, and for each of the m
 * positions i = 0 .. m-1, the active lanes load a contiguous run starting
 * at panel + i * lda and store it at the current write position in b,
 * which then advances by the number of active lanes. The final panel may
 * be partial; its predicate is produced by svwhilelt_b64(j, n).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    uint64_t lanes = svcntd();

    IFLOAT *panel = a;
    IFLOAT *dst = b;

    BLASLONG col = 0;
    svbool_t live = svwhilelt_b64(col, n);
    uint64_t live_cnt = svcntp_b64(svptrue_b64(), live);

    do {
        IFLOAT *src = panel;

        for (uint64_t rows_left = m; rows_left != 0; rows_left--) {
            svfloat64_t chunk = svld1(live, (double *) src);
            svst1_f64(live, (double *) dst, chunk);
            src += lda;
            dst += live_cnt;
        }

        /* The next panel starts one vector length further along. */
        panel += lanes;

        col += svcntd();
        live = svwhilelt_b64(col, n);
        live_cnt = svcntp_b64(svptrue_b64(), live);
    } while (svptest_any(svptrue_b64(), live));

    return 0;
}
|
|
|
@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
|
||||||
BLASLONG sve_width = SVE_WIDTH;
|
BLASLONG sve_width = SVE_WIDTH;
|
||||||
|
|
||||||
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
|
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
|
||||||
svbool_t pg_a = SVE_WHILELT(i, n);
|
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
|
||||||
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
|
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
|
||||||
|
|
||||||
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
|
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
|
||||||
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
|
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
|
||||||
|
|
|
@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include <float.h>
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
|
@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
#else
|
#else
|
||||||
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
||||||
#endif
|
#endif
|
||||||
if (fabs(scale) <1.e-300) return 0.;
|
volatile FLOAT sca = fabs(scale);
|
||||||
|
if (sca < DBL_MIN) return 0.;
|
||||||
ssq = sqrt(ssq) * scale;
|
ssq = sqrt(ssq) * scale;
|
||||||
|
|
||||||
return ssq;
|
return ssq;
|
||||||
|
|
|
@ -0,0 +1,121 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2023, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Element-width selection: DOUBLE builds use 64-bit lanes, all other builds
 * use 32-bit lanes. COUNT is the mnemonic of the SVE instruction that
 * returns the number of such lanes in one vector.
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif

/*
 * Copies one two-element (real, imaginary) row across the active lanes.
 *
 * Gathers the element at a_offset_inner and the one at a_offset_inner + 1
 * for every active lane (lane offsets come from lda_vec), stores them
 * interleaved at b_offset with svst2, then advances a_offset_inner by 2
 * and b_offset by 2 * active.
 *
 * The macro relies on `lda_vec`, `a_vec_real` and `a_vec_imag` being
 * declared in the enclosing scope; the `lda` argument is accepted but not
 * used. It is wrapped in do { } while (0) so that it behaves as a single
 * statement, and arguments are parenthesised where used as values.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)                 \
    do {                                                                      \
        a_vec_real = svld1_gather_index((pg), (a_offset_inner), lda_vec);     \
        a_vec_imag = svld1_gather_index((pg), (a_offset_inner) + 1, lda_vec); \
        svst2((pg), (b_offset), svcreate2(a_vec_real, a_vec_imag));           \
        (a_offset_inner) += 2;                                                \
        (b_offset) += (active) * 2;                                           \
    } while (0)
|
||||||
|
|
||||||
|
/*
 * Packs two-element (real, imaginary) data from `a` into `b` using SVE
 * gathers.
 *
 * The n direction is processed in panels of one vector length (the lane
 * count comes from the COUNT instruction), followed by one partial panel
 * for any remainder. Within a panel, each of the m positions is copied by
 * INNER_COPY, unrolled 4 / 2 / 1 times.
 *
 * Assumes m is non-negative (m >> 2 is used as an unsigned trip count).
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* INNER_COPY refers to lda_vec, a_vec_real and a_vec_imag by name. */
    SV_TYPE a_vec_real;
    SV_TYPE a_vec_imag;
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
    svbool_t every_lane = SV_TRUE();

    uint64_t lanes;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (lanes) : : );

    IFLOAT *panel = a;
    IFLOAT *dst = b;
    IFLOAT *src;

    /* Largest multiple of the lane count that does not exceed n. */
    BLASLONG full_cols = n & -lanes;
    BLASLONG tail_cols = n - full_cols;

    for (BLASLONG col = 0; col < full_cols; col += lanes) {
        svbool_t mask = every_lane;
        uint64_t width = lanes;

        src = panel;

        for (uint64_t quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 2) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 1) {
            INNER_COPY(mask, src, dst, lda, width);
        }

        panel += lanes * lda * 2;
    }

    if (tail_cols != 0) {
        /* Partial panel: only the first tail_cols lanes are active. */
        svbool_t mask = SV_WHILE((uint64_t)0L, (uint64_t)tail_cols);
        uint64_t width = tail_cols;

        src = panel;

        for (uint64_t quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 2) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 1) {
            INNER_COPY(mask, src, dst, lda, width);
        }
    }

    return 0;
}
|
||||||
|
|
|
@ -0,0 +1,131 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2023, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Element-width selection: DOUBLE builds use 64-bit lanes, all other builds
 * use 32-bit lanes. COUNT is the mnemonic of the SVE instruction that
 * returns the number of such lanes in one vector. SV_PREFETCH names the
 * matching gather-prefetch intrinsic; it is not referenced by the code
 * visible in this file section.
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#define SV_PREFETCH svprfd_gather_index
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#define SV_PREFETCH svprfw_gather_index
#endif

/*
 * Copies one row across the active lanes.
 *
 * Gathers the element at a_offset_inner for every active lane (lane
 * offsets come from lda_vec), stores the result contiguously at b_offset,
 * then advances a_offset_inner by 1 and b_offset by `active`.
 *
 * The macro relies on `lda_vec` and `a_vec` being declared in the
 * enclosing scope; the `lda` argument is accepted but not used. It is
 * wrapped in do { } while (0) so that it behaves as a single statement,
 * and arguments are parenthesised where used as values.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)        \
    do {                                                             \
        a_vec = svld1_gather_index((pg), (a_offset_inner), lda_vec); \
        svst1((pg), (b_offset), a_vec);                              \
        (a_offset_inner)++;                                          \
        (b_offset) += (active);                                      \
    } while (0)
|
||||||
|
|
||||||
|
/*
 * Packs data from `a` into `b` using SVE gathers.
 *
 * The n direction is processed in panels of one vector length (the lane
 * count comes from the COUNT instruction), followed by one partial panel
 * for any remainder. Within a full panel each of the m positions is copied
 * by INNER_COPY, unrolled 8 / 4 / 2 / 1 times; the partial panel uses a
 * 4 / 2 / 1 unrolling.
 *
 * Assumes m is non-negative (m >> 3 and m >> 2 are used as unsigned trip
 * counts). Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* INNER_COPY refers to lda_vec and a_vec by name. */
    SV_TYPE a_vec;
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda);
    svbool_t every_lane = SV_TRUE();

    uint64_t lanes;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (lanes) : : );

    IFLOAT *panel = a;
    IFLOAT *dst = b;
    IFLOAT *src;

    /* Largest multiple of the lane count that does not exceed n. */
    BLASLONG full_cols = n & -lanes;
    BLASLONG tail_cols = n - full_cols;

    for (BLASLONG col = 0; col < full_cols; col += lanes) {
        svbool_t mask = every_lane;
        uint64_t width = lanes;

        src = panel;

        for (uint64_t octets = m >> 3; octets != 0; octets--) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 4) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 2) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 1) {
            INNER_COPY(mask, src, dst, lda, width);
        }

        panel += lanes * lda;
    }

    if (tail_cols != 0) {
        /* Partial panel: only the first tail_cols lanes are active. */
        svbool_t mask = SV_WHILE((uint64_t)0L, (uint64_t)tail_cols);
        uint64_t width = tail_cols;

        src = panel;

        for (uint64_t quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 2) {
            INNER_COPY(mask, src, dst, lda, width);
            INNER_COPY(mask, src, dst, lda, width);
        }

        if (m & 1) {
            INNER_COPY(mask, src, dst, lda, width);
        }
    }

    return 0;
}
|
|
@ -0,0 +1,115 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2023, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Element-width selection: DOUBLE builds use 64-bit lanes, all other builds
 * use 32-bit lanes. SV_TYPE is the two-vector tuple type handled by
 * svld2 / svst2. COUNT is the mnemonic of the SVE instruction that returns
 * the number of such lanes in one vector.
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64x2_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32x2_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif

/*
 * Copies one contiguous run of two-element structures.
 *
 * Loads the active two-element structures starting at a_offset_inner with
 * svld2, stores them at b_offset with svst2, then advances a_offset_inner
 * by 2 * lda and b_offset by 2 * active.
 *
 * The macro relies on `a_vec` being declared in the enclosing scope. It is
 * wrapped in do { } while (0) so that it behaves as a single statement,
 * and arguments are parenthesised where used as values.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
    do {                                                      \
        a_vec = svld2((pg), (a_offset_inner));                \
        svst2((pg), (b_offset), a_vec);                       \
        (a_offset_inner) += (lda) * 2;                        \
        (b_offset) += (active) * 2;                           \
    } while (0)
|
||||||
|
|
||||||
|
/*
 * Packs two-element structures (presumably complex real/imaginary pairs)
 * from `a` into `b` using contiguous SVE structure loads and stores.
 *
 * The n direction is processed in panels of one vector length, followed by
 * one partial panel for any remainder. Within a panel each of the m
 * positions is copied by INNER_COPY, unrolled 4 / 2 / 1 times; successive
 * positions are lda structures apart in `a`.
 *
 * Assumes m is non-negative (m >> 2 is used as an unsigned trip count).
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /*
     * The lane count is read with the COUNT instruction, which matches the
     * element width selected by DOUBLE. No initialiser is needed: the asm
     * statement is the sole writer of sve_size.
     */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* INNER_COPY refers to a_vec by name. */
    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /* Largest multiple of the lane count that does not exceed n. */
    BLASLONG single_vectors_n = n & -sve_size;
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Next panel: one vector of two-element structures further along. */
        a_offset += sve_size * 2;
    }

    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        /* Partial panel: only the first remaining_n lanes are active. */
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,125 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2023, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Element-width selection: DOUBLE builds use 64-bit lanes, all other builds
 * use 32-bit lanes. COUNT is the mnemonic of the SVE instruction that
 * returns the number of such lanes in one vector.
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif

/*
 * Copies one contiguous run of elements.
 *
 * Loads the active elements starting at a_offset_inner, stores them at
 * b_offset, then advances a_offset_inner by lda and b_offset by `active`.
 *
 * The macro relies on `a_vec` being declared in the enclosing scope. It is
 * wrapped in do { } while (0) so that it behaves as a single statement,
 * and arguments are parenthesised where used as values.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
    do {                                                      \
        a_vec = svld1((pg), (a_offset_inner));                \
        svst1((pg), (b_offset), a_vec);                       \
        (a_offset_inner) += (lda);                            \
        (b_offset) += (active);                               \
    } while (0)
|
||||||
|
|
||||||
|
/*
 * Packs data from `a` into `b` using contiguous SVE loads and stores.
 *
 * The n direction is processed in panels of one vector length, followed by
 * one partial panel for any remainder. Within a full panel each of the m
 * positions is copied by INNER_COPY, unrolled 8 / 4 / 2 / 1 times; the
 * partial panel uses a 4 / 2 / 1 unrolling. Successive positions are lda
 * elements apart in `a`.
 *
 * Assumes m is non-negative (m >> 3 and m >> 2 are used as unsigned trip
 * counts). Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /*
     * The lane count is read with the COUNT instruction, which matches the
     * element width selected by DOUBLE. No initialiser is needed: the asm
     * statement is the sole writer of sve_size.
     */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* INNER_COPY refers to a_vec by name. */
    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /* Largest multiple of the lane count that does not exceed n. */
    BLASLONG single_vectors_n = n & -sve_size;
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;
        uint64_t i_cnt = m >> 3;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 4) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Next panel: one vector length further along. */
        a_offset += sve_size;
    }

    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        /* Partial panel: only the first remaining_n lanes are active. */
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}
|
||||||
|
|
|
@ -1,78 +0,0 @@
|
||||||
/*********************************************************************/
|
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
||||||
/* All rights reserved. */
|
|
||||||
/* */
|
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
|
||||||
/* without modification, are permitted provided that the following */
|
|
||||||
/* conditions are met: */
|
|
||||||
/* */
|
|
||||||
/* 1. Redistributions of source code must retain the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer. */
|
|
||||||
/* */
|
|
||||||
/* 2. Redistributions in binary form must reproduce the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer in the documentation and/or other materials */
|
|
||||||
/* provided with the distribution. */
|
|
||||||
/* */
|
|
||||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
||||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
||||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
||||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
||||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
||||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
||||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
||||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
||||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
||||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
||||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
||||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
|
||||||
/* */
|
|
||||||
/* The views and conclusions contained in the software and */
|
|
||||||
/* documentation are those of the authors and should not be */
|
|
||||||
/* interpreted as representing official policies, either expressed */
|
|
||||||
/* or implied, of The University of Texas at Austin. */
|
|
||||||
/*********************************************************************/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "common.h"
|
|
||||||
#include <arm_sve.h>
|
|
||||||
|
|
||||||
// TODO: write in assembly with proper unrolling of inner loop
|
|
||||||
/*
 * Packs `a` into `b` one vector-wide panel at a time using 32-bit SVE lanes.
 *
 * For every panel of up to svcntw() lanes, and for each of the m positions
 * i = 0 .. m-1, the active lanes gather the elements a[i + k * lda]
 * (k = lane number) and store them contiguously at the current write
 * position in b, which then advances by the number of active lanes.
 * The final panel may be partial; its predicate is produced by
 * svwhilelt_b32(j, n).
 *
 * Note: the index vector and the row counter are 32-bit, as in the
 * original code. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Index vector {0, lda, 2*lda, ...}: lane k reads k*lda elements further on. */
    svint32_t stride_idx = svindex_s32(0LL, lda);
    uint32_t lanes = svcntw();

    IFLOAT *panel = a;
    IFLOAT *dst = b;

    BLASLONG col = 0;
    svbool_t live = svwhilelt_b32(col, n);
    uint32_t live_cnt = svcntp_b32(svptrue_b32(), live);

    do {
        IFLOAT *src = panel;

        for (uint32_t rows_left = m; rows_left != 0; rows_left--) {
            svfloat32_t gathered = svld1_gather_index(live, (float *) src, stride_idx);
            svst1_f32(live, (float *) dst, gathered);
            src++;
            dst += live_cnt;
        }

        /* Step to the next group of `lanes` strided vectors. */
        panel += lanes * lda;

        col += svcntw();
        live = svwhilelt_b32(col, n);
        live_cnt = svcntp_b32(svptrue_b32(), live);
    } while (svptest_any(svptrue_b32(), live));

    return 0;
}
|
|
|
@ -1,77 +0,0 @@
|
||||||
/*********************************************************************/
|
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
|
||||||
/* All rights reserved. */
|
|
||||||
/* */
|
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
|
||||||
/* without modification, are permitted provided that the following */
|
|
||||||
/* conditions are met: */
|
|
||||||
/* */
|
|
||||||
/* 1. Redistributions of source code must retain the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer. */
|
|
||||||
/* */
|
|
||||||
/* 2. Redistributions in binary form must reproduce the above */
|
|
||||||
/* copyright notice, this list of conditions and the following */
|
|
||||||
/* disclaimer in the documentation and/or other materials */
|
|
||||||
/* provided with the distribution. */
|
|
||||||
/* */
|
|
||||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
|
||||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
|
||||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
|
||||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
|
||||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
|
||||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
|
||||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
|
||||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
|
||||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
|
||||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
|
||||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
|
||||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
|
||||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
|
||||||
/* */
|
|
||||||
/* The views and conclusions contained in the software and */
|
|
||||||
/* documentation are those of the authors and should not be */
|
|
||||||
/* interpreted as representing official policies, either expressed */
|
|
||||||
/* or implied, of The University of Texas at Austin. */
|
|
||||||
/*********************************************************************/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include "common.h"
|
|
||||||
#include <arm_sve.h>
|
|
||||||
|
|
||||||
// TODO: write in assembly with proper unrolling of inner loop
|
|
||||||
/*
 * Copies data from `a` into the packed buffer `b` using SVE with 32-bit lanes.
 *
 * The outer do/while loop walks the n dimension in chunks of up to svcntw()
 * lanes; the inner loop runs m times per chunk. Each inner step loads the
 * active lanes from consecutive floats starting at aoffset1 and stores them
 * contiguously at boffset, then moves the source forward by lda elements.
 *
 * m, n  - trip counts of the inner and outer loops respectively
 * a     - source buffer, accessed as float
 * lda   - element stride between consecutive inner-loop loads
 * b     - destination buffer, written sequentially as float
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|
||||||
|
|
||||||
BLASLONG j;
|
|
||||||
IFLOAT *aoffset, *aoffset1, *boffset;
|
|
||||||
|
|
||||||
/* Number of 32-bit lanes in one SVE vector. */
uint32_t sve_size = svcntw();
|
|
||||||
|
|
||||||
aoffset = a;
|
|
||||||
boffset = b;
|
|
||||||
|
|
||||||
j = 0;
|
|
||||||
/* pg selects the lanes whose index j + lane is still below n. */
svbool_t pg = svwhilelt_b32(j, n);
|
|
||||||
/* Count of active lanes in pg; used as the per-step advance of boffset. */
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
|
||||||
do {
|
|
||||||
|
|
||||||
aoffset1 = aoffset;
|
|
||||||
|
|
||||||
/* NOTE(review): m is a BLASLONG stored into a uint32_t counter - confirm */
/* that callers never pass a value outside the 32-bit unsigned range. */
uint32_t i_cnt = m;
|
|
||||||
while (i_cnt--) {
|
|
||||||
/* Contiguous predicated load of the active lanes from aoffset1. */
svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
|
|
||||||
/* Store the active lanes contiguously at the current output position. */
svst1_f32(pg, (float *) boffset, a_vec);
|
|
||||||
/* Next load starts lda elements further along in the source. */
aoffset1 += lda;
|
|
||||||
boffset += active;
|
|
||||||
}
|
|
||||||
/* Move the chunk base forward by one full vector of elements. */
aoffset += sve_size;
|
|
||||||
|
|
||||||
j += svcntw();
|
|
||||||
/* Recompute the predicate and lane count for the next chunk. */
pg = svwhilelt_b32(j, n);
|
|
||||||
active = svcntp_b32(svptrue_b32(), pg);
|
|
||||||
|
|
||||||
/* Continue while the new predicate still has at least one active lane. */
} while (svptest_any(svptrue_b32(), pg));
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
svint64_t one_vec = svdup_s64(1LL);
|
svint64_t one_vec = svdup_s64(1LL);
|
||||||
|
|
||||||
int64_t j = 0;
|
int64_t j = 0;
|
||||||
svbool_t pg = svwhilelt_b64(j, n);
|
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||||
svint64_t index = svindex_s64(0LL, 1LL);
|
svint64_t index = svindex_s64(0LL, 1LL);
|
||||||
|
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
posX += sve_size;
|
posX += sve_size;
|
||||||
posX_vec = svdup_s64(posX);
|
posX_vec = svdup_s64(posX);
|
||||||
j += sve_size;
|
j += sve_size;
|
||||||
pg = svwhilelt_b64(j, n);
|
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||||
active = svcntp_b64(svptrue_b64(), pg);
|
active = svcntp_b64(svptrue_b64(), pg);
|
||||||
} while (svptest_any(svptrue_b64(), pg));
|
} while (svptest_any(svptrue_b64(), pg));
|
||||||
|
|
||||||
|
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
|
|
||||||
int32_t N = n;
|
int32_t N = n;
|
||||||
int32_t j = 0;
|
int32_t j = 0;
|
||||||
svbool_t pg = svwhilelt_b32(j, N);
|
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||||
svint32_t index_neg = svindex_s32(0, -1);
|
svint32_t index_neg = svindex_s32(0, -1);
|
||||||
svint32_t index = svindex_s32(0, 1);
|
svint32_t index = svindex_s32(0, 1);
|
||||||
|
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
posX += sve_size;
|
posX += sve_size;
|
||||||
posX_vec = svdup_s32(posX);
|
posX_vec = svdup_s32(posX);
|
||||||
j += sve_size;
|
j += sve_size;
|
||||||
pg = svwhilelt_b32(j, N);
|
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||||
active = svcntp_b32(svptrue_b32(), pg);
|
active = svcntp_b32(svptrue_b32(), pg);
|
||||||
} while (svptest_any(svptrue_b32(), pg));
|
} while (svptest_any(svptrue_b32(), pg));
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
|
/* Copyright 2023 The OpenBLAS Project */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
svint64_t one_vec = svdup_s64(1LL);
|
svint64_t one_vec = svdup_s64(1LL);
|
||||||
|
|
||||||
int64_t j = 0;
|
int64_t j = 0;
|
||||||
svbool_t pg = svwhilelt_b64(j, n);
|
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||||
svint64_t index = svindex_s64(0LL, 1LL);
|
svint64_t index = svindex_s64(0LL, 1LL);
|
||||||
|
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
posX += sve_size;
|
posX += sve_size;
|
||||||
posX_vec = svdup_s64(posX);
|
posX_vec = svdup_s64(posX);
|
||||||
j += sve_size;
|
j += sve_size;
|
||||||
pg = svwhilelt_b64(j, n);
|
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||||
active = svcntp_b64(svptrue_b64(), pg);
|
active = svcntp_b64(svptrue_b64(), pg);
|
||||||
} while (svptest_any(svptrue_b64(), pg));
|
} while (svptest_any(svptrue_b64(), pg));
|
||||||
|
|
||||||
|
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
|
|
||||||
int32_t N = n;
|
int32_t N = n;
|
||||||
int32_t j = 0;
|
int32_t j = 0;
|
||||||
svbool_t pg = svwhilelt_b32(j, N);
|
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||||
svint32_t index_neg = svindex_s32(0, -1);
|
svint32_t index_neg = svindex_s32(0, -1);
|
||||||
svint32_t index = svindex_s32(0, 1);
|
svint32_t index = svindex_s32(0, 1);
|
||||||
|
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
posX += sve_size;
|
posX += sve_size;
|
||||||
posX_vec = svdup_s32(posX);
|
posX_vec = svdup_s32(posX);
|
||||||
j += sve_size;
|
j += sve_size;
|
||||||
pg = svwhilelt_b32(j, N);
|
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||||
active = svcntp_b32(svptrue_b32(), pg);
|
active = svcntp_b32(svptrue_b32(), pg);
|
||||||
} while (svptest_any(svptrue_b32(), pg));
|
} while (svptest_any(svptrue_b32(), pg));
|
||||||
|
|
||||||
|
|
|
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
FLOAT *ao;
|
FLOAT *ao;
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
svint64_t index = svindex_s64(0LL, lda);
|
svint64_t index = svindex_s64(0LL, lda);
|
||||||
svbool_t pn = svwhilelt_b64(js, n);
|
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||||
#else
|
#else
|
||||||
svint32_t index = svindex_s32(0, lda);
|
svint32_t index = svindex_s32(0, lda);
|
||||||
svbool_t pn = svwhilelt_b32(js, n);
|
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||||
#endif
|
#endif
|
||||||
do
|
do
|
||||||
|
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
posY += n_active;
|
posY += n_active;
|
||||||
js += n_active;
|
js += n_active;
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
pn = svwhilelt_b64(js, n);
|
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||||
} while (svptest_any(svptrue_b64(), pn));
|
} while (svptest_any(svptrue_b64(), pn));
|
||||||
#else
|
#else
|
||||||
pn = svwhilelt_b32(js, n);
|
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||||
} while (svptest_any(svptrue_b32(), pn));
|
} while (svptest_any(svptrue_b32(), pn));
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
FLOAT *ao;
|
FLOAT *ao;
|
||||||
js = 0;
|
js = 0;
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
svbool_t pn = svwhilelt_b64(js, n);
|
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||||
#else
|
#else
|
||||||
svbool_t pn = svwhilelt_b32(js, n);
|
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||||
#endif
|
#endif
|
||||||
do
|
do
|
||||||
|
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
posY += n_active;
|
posY += n_active;
|
||||||
js += n_active;
|
js += n_active;
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
pn = svwhilelt_b64(js, n);
|
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||||
} while (svptest_any(svptrue_b64(), pn));
|
} while (svptest_any(svptrue_b64(), pn));
|
||||||
#else
|
#else
|
||||||
pn = svwhilelt_b32(js, n);
|
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||||
} while (svptest_any(svptrue_b32(), pn));
|
} while (svptest_any(svptrue_b32(), pn));
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
FLOAT *ao;
|
FLOAT *ao;
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
svint64_t index = svindex_s64(0LL, lda);
|
svint64_t index = svindex_s64(0LL, lda);
|
||||||
svbool_t pn = svwhilelt_b64(js, n);
|
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||||
#else
|
#else
|
||||||
svint32_t index = svindex_s32(0, lda);
|
svint32_t index = svindex_s32(0, lda);
|
||||||
svbool_t pn = svwhilelt_b32(js, n);
|
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||||
#endif
|
#endif
|
||||||
do
|
do
|
||||||
|
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||||
posY += n_active;
|
posY += n_active;
|
||||||
js += n_active;
|
js += n_active;
|
||||||
#ifdef DOUBLE
|
#ifdef DOUBLE
|
||||||
pn = svwhilelt_b64(js, n);
|
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||||
} while (svptest_any(svptrue_b64(), pn));
|
} while (svptest_any(svptrue_b64(), pn));
|
||||||
#else
|
#else
|
||||||
pn = svwhilelt_b32(js, n);
|
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||||
} while (svptest_any(svptrue_b32(), pn));
|
} while (svptest_any(svptrue_b32(), pn));
|
||||||
#endif
|
#endif
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue