Merge pull request #4210 from xianyi/develop
merge develop into 0.3.0 for 0.3.24
This commit is contained in:
commit
2c68822cde
|
@ -0,0 +1,167 @@
|
|||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM/ILP64
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM/CMAKE
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- mkdir build
|
||||
- cd build
|
||||
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||
- make
|
||||
|
||||
task:
|
||||
name: AppleM1/GCC/MAKE/OPENMP
|
||||
compile_script:
|
||||
- brew install gcc@11
|
||||
- export PATH=/opt/homebrew/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/include"
|
||||
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
task:
|
||||
name: AppleM1/LLVM x86_64 xbuild
|
||||
compile_script:
|
||||
- #brew install llvm
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- export ARCHS="i386 x86_64"
|
||||
- export ARCHS_STANDARD="i386 x86_64"
|
||||
- export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
|
||||
- export ARCHS_STANDARD_64_BIT=x86_64
|
||||
- export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
|
||||
- export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
|
||||
- export VALID_ARCHS="i386 x86_64"
|
||||
- xcrun --sdk macosx --show-sdk-path
|
||||
- xcodebuild -version
|
||||
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
|
||||
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
path: "*conf*"
|
||||
type: text/plain
|
||||
# lib_artifacts:
|
||||
# path: "libopenblas*"
|
||||
# type: application/octet-stream
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
task:
|
||||
name: AppleM1/LLVM armv8-ios xbuild
|
||||
compile_script:
|
||||
- #brew install llvm
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
|
||||
always:
|
||||
config_artifacts:
|
||||
path: "*conf*"
|
||||
type: text/plain
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
task:
|
||||
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||
compile_script:
|
||||
- #brew install android-ndk
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
|
||||
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
path: "*conf*"
|
||||
type: text/plain
|
||||
|
||||
task:
|
||||
name: NeoverseN1
|
||||
arm_container:
|
||||
image: node:latest
|
||||
compile_script:
|
||||
- make
|
||||
|
||||
task:
|
||||
name: NeoverseN1-ILP64
|
||||
arm_container:
|
||||
image: node:latest
|
||||
compile_script:
|
||||
- make INTERFACE64=1
|
||||
|
||||
task:
|
||||
name: NeoverseN1-OMP
|
||||
arm_container:
|
||||
image: node:latest
|
||||
cpu: 8
|
||||
compile_script:
|
||||
- make USE_OPENMP=1
|
||||
|
||||
FreeBSD_task:
|
||||
name: FreeBSD-gcc12
|
||||
freebsd_instance:
|
||||
image_family: freebsd-13-2
|
||||
install_script:
|
||||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
|
||||
compile_script:
|
||||
- ls -l /usr/local/lib
|
||||
- gmake CC=gcc
|
||||
|
||||
|
||||
FreeBSD_task:
|
||||
name: freebsd-gcc12-ilp64
|
||||
freebsd_instance:
|
||||
image_family: freebsd-13-2
|
||||
install_script:
|
||||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
|
||||
compile_script:
|
||||
- ls -l /usr/local/lib
|
||||
- gmake CC=gcc INTERFACE64=1
|
||||
|
||||
#task:
|
||||
# name: Windows/LLVM16 --- too slow ---
|
||||
# windows_container:
|
||||
# image: cirrusci/windowsservercore:cmake-2021.12.07
|
||||
# install_script:
|
||||
# - choco list --localonly
|
||||
# - choco install -y llvm
|
||||
# - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
|
||||
# - choco install -y ninja
|
||||
# - refreshenv
|
||||
# - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
|
||||
# - vcvarsall x64
|
||||
# - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
|
||||
# - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
|
||||
# - cd build
|
||||
# - cmake --build .
|
||||
# - ctest
|
|
@ -0,0 +1,121 @@
|
|||
name: c910v qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
|
||||
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: RISCV64_GENERIC
|
||||
triple: riscv64-linux-gnu
|
||||
apt_triple: riscv64-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
|
||||
- target: C910V
|
||||
triple: riscv64-unknown-linux-gnu
|
||||
apt_triple: riscv64-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=C910V
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install build deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
|
||||
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
|
||||
|
||||
- name: checkout qemu
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
repository: T-head-Semi/qemu
|
||||
path: qemu
|
||||
ref: 1e692ebb43d396c52352406323fc782c1ac99a42
|
||||
|
||||
- name: build qemu
|
||||
run: |
|
||||
# Force the use of the c910v qemu-user
|
||||
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
|
||||
cd qemu
|
||||
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
|
||||
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
|
||||
make -j$(nproc)
|
||||
make install
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: build OpenBLAS
|
||||
run: |
|
||||
wget ${xuetie_toolchain}/${toolchain_file_name}
|
||||
tar -xvf ${toolchain_file_name} -C /opt
|
||||
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
|
||||
|
||||
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
- name: test
|
||||
run: |
|
||||
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
|
||||
qemu-riscv64 ./utest/openblas_utest
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
|
|
@ -151,40 +151,53 @@ jobs:
|
|||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
msystem: [MINGW64, MINGW32, CLANG64]
|
||||
msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
|
||||
idx: [int32, int64]
|
||||
build-type: [Release]
|
||||
include:
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
fc-pkg: fc
|
||||
- msystem: MINGW32
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-i686
|
||||
fc-pkg: mingw-w64-i686-gcc-fortran
|
||||
fc-pkg: fc
|
||||
- msystem: CLANG64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
fc-pkg: fc
|
||||
# Compiling with Flang 16 seems to cause test errors on machines
|
||||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||
no-avx512-flags: -DNO_AVX512=1
|
||||
- msystem: CLANG32
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-clang-i686
|
||||
fc-pkg: cc
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
- msystem: MINGW64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
fc-pkg: fc
|
||||
- msystem: CLANG64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
fc-pkg: fc
|
||||
# Compiling with Flang 16 seems to cause test errors on machines
|
||||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||
no-avx512-flags: -DNO_AVX512=1
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
fc-pkg: fc
|
||||
build-type: None
|
||||
exclude:
|
||||
- msystem: MINGW32
|
||||
idx: int64
|
||||
- msystem: CLANG32
|
||||
idx: int64
|
||||
|
||||
defaults:
|
||||
run:
|
||||
|
@ -209,7 +222,7 @@ jobs:
|
|||
install: >-
|
||||
base-devel
|
||||
${{ matrix.target-prefix }}-cc
|
||||
${{ matrix.fc-pkg }}
|
||||
${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
|
||||
${{ matrix.target-prefix }}-cmake
|
||||
${{ matrix.target-prefix }}-ninja
|
||||
${{ matrix.target-prefix }}-ccache
|
||||
|
@ -217,14 +230,21 @@ jobs:
|
|||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
# It looks like this path needs to be hard-coded.
|
||||
path: C:/msys64/home/runneradmin/.ccache
|
||||
- name: Prepare ccache
|
||||
# Get cache location of ccache
|
||||
# Create the key that is used in the actions/cache/restore and actions/cache/save steps
|
||||
id: ccache-prepare
|
||||
run: |
|
||||
echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
|
||||
echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Restore ccache
|
||||
uses: actions/cache/restore@v3
|
||||
with:
|
||||
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||
key: ${{ steps.ccache-prepare.outputs.key }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch.
|
||||
restore-keys: |
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
|
||||
|
@ -234,9 +254,10 @@ jobs:
|
|||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
|
||||
run: |
|
||||
which ccache
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 250M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||
echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
|
||||
echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
|
||||
ccache -p
|
||||
ccache -s
|
||||
echo $HOME
|
||||
cygpath -w $HOME
|
||||
|
@ -253,6 +274,7 @@ jobs:
|
|||
-DTARGET=CORE2 \
|
||||
${{ matrix.idx64-flags }} \
|
||||
${{ matrix.c-lapack-flags }} \
|
||||
${{ matrix.no-avx512-flags }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
|
@ -264,10 +286,30 @@ jobs:
|
|||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Save ccache
|
||||
# Save the cache after we are done (successfully) building
|
||||
uses: actions/cache/save@v3
|
||||
with:
|
||||
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||
key: ${{ steps.ccache-prepare.outputs.key }}
|
||||
|
||||
- name: Run tests
|
||||
id: run-ctest
|
||||
timeout-minutes: 60
|
||||
run: cd build && ctest
|
||||
|
||||
- name: Re-run tests
|
||||
if: always() && (steps.run-ctest.outcome == 'failure')
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
cd build
|
||||
echo "::group::Re-run ctest"
|
||||
ctest --rerun-failed --output-on-failure || true
|
||||
echo "::endgroup::"
|
||||
echo "::group::Log from these tests"
|
||||
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
|
||||
echo "::endgroup::"
|
||||
|
||||
|
||||
cross_build:
|
||||
runs-on: ubuntu-22.04
|
||||
|
@ -295,6 +337,7 @@ jobs:
|
|||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
|
||||
|
||||
- name: Compilation cache
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
name: loongarch64 qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: LOONGSONGENERIC
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
|
||||
- target: LOONGSON3R5
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSON3R5
|
||||
- target: LOONGSON2K1000
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSON2K1000
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Install APT deps
|
||||
run: |
|
||||
sudo add-apt-repository ppa:savoury1/virtualisation
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
|
||||
qemu-user-static
|
||||
|
||||
- name: Download and install loongarch64-toolchain
|
||||
run: |
|
||||
wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
|
||||
tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
|
||||
|
||||
- name: Set env
|
||||
run: |
|
||||
echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
|
||||
echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Disable utest dsdot:dsdot_n_1
|
||||
run: |
|
||||
echo -n > utest/test_dsdot.c
|
||||
echo "Due to the qemu versions 7.2 causing utest cases to fail,"
|
||||
echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
qemu-loongarch64-static ./utest/openblas_utest
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
|
@ -14,6 +14,7 @@ lapack-3.4.2
|
|||
lapack-3.4.2.tgz
|
||||
lapack-netlib/make.inc
|
||||
lapack-netlib/lapacke/include/lapacke_mangling.h
|
||||
lapack-netlib/SRC/la_constants.mod
|
||||
lapack-netlib/TESTING/testing_results.txt
|
||||
lapack-netlib/INSTALL/test*
|
||||
lapack-netlib/TESTING/xeigtstc
|
||||
|
@ -71,6 +72,7 @@ test/SBLAT3.SUMM
|
|||
test/ZBLAT2.SUMM
|
||||
test/ZBLAT3.SUMM
|
||||
test/SHBLAT3.SUMM
|
||||
test/SBBLAT3.SUMM
|
||||
test/cblat1
|
||||
test/cblat2
|
||||
test/cblat3
|
||||
|
@ -81,6 +83,7 @@ test/sblat1
|
|||
test/sblat2
|
||||
test/sblat3
|
||||
test/test_shgemm
|
||||
test/test_sbgemm
|
||||
test/zblat1
|
||||
test/zblat2
|
||||
test/zblat3
|
||||
|
|
|
@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
|
|||
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 23)
|
||||
set(OpenBLAS_PATCH_VERSION 23.dev)
|
||||
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
|
@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers)
|
|||
#######
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
|
||||
|
||||
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
|
||||
|
||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||
|
||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||
|
@ -309,20 +311,26 @@ endif()
|
|||
|
||||
#if (MSVC OR NOT NOFORTRAN)
|
||||
if (NOT NO_CBLAS)
|
||||
if (NOT ONLY_CBLAS)
|
||||
# Broken without fortran on unix
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT NOFORTRAN)
|
||||
if (NOT ONLY_CBLAS)
|
||||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
if (BUILD_TESTING)
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
endif()
|
||||
endif()
|
||||
if(NOT NO_CBLAS)
|
||||
if (NOT ONLY_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
endif()
|
||||
endif()
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
|
@ -398,15 +406,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
|||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
|
||||
if (${BUILD_LAPACK_DEPRECATED})
|
||||
set (BLD 1)
|
||||
else ()
|
||||
set (BLD 0)
|
||||
endif()
|
||||
if (${BUILD_BFLOAT16})
|
||||
set (BBF16 1)
|
||||
else ()
|
||||
set (BBF16 0)
|
||||
endif()
|
||||
if (${BUILD_SINGLE})
|
||||
set (BS 1)
|
||||
else ()
|
||||
set (BS 0)
|
||||
endif()
|
||||
if (${BUILD_DOUBLE})
|
||||
set (BD 1)
|
||||
else ()
|
||||
set (BD 0)
|
||||
endif()
|
||||
if (${BUILD_COMPLEX})
|
||||
set (BC 1)
|
||||
else ()
|
||||
set (BC 0)
|
||||
endif()
|
||||
if (${BUILD_COMPLEX16})
|
||||
set (BZ 1)
|
||||
else ()
|
||||
set (BZ 0)
|
||||
endif()
|
||||
if (NOT USE_PERL)
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
else()
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
|
@ -511,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/
|
|||
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
|
||||
|
||||
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
||||
set(PN OpenBLAS)
|
||||
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
|
||||
set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
|
||||
configure_package_config_file(cmake/${PN}Config.cmake.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
|
||||
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
|
|
|
@ -23,6 +23,9 @@
|
|||
* Optimization on AMD Piledriver
|
||||
* Optimization on Intel Haswell
|
||||
|
||||
* Chris Sidebottom <chris.sidebottom@arm.com>
|
||||
* Optimizations and other improvements targeting AArch64
|
||||
|
||||
## Previous Developers
|
||||
|
||||
* Zaheer Chothia <zaheer.chothia@gmail.com>
|
||||
|
|
100
Changelog.txt
100
Changelog.txt
|
@ -1,4 +1,104 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.24
|
||||
03-Sep-2023
|
||||
|
||||
general:
|
||||
- declared the arguments of cblas_xerbla as const (in accordance with the reference implementation
|
||||
and others; the previous discrepancy appears to date back to GotoBLAS)
|
||||
- fixed the implementation of ?GEMMT that was added in 0.3.23
|
||||
- made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds
|
||||
- fixed application of SYMBOLSUFFIX in CMAKE builds
|
||||
- fixed missing SSYCONVF function in the shared library
|
||||
- fixed parallel build logic used with gmake
|
||||
- added support for compilation with LLVM17, in particular its new Fortran compiler
|
||||
- added support for CMAKE builds using the NVIDIA HPC compiler
|
||||
- fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler
|
||||
- fixed cross-build detection and management in c_check
|
||||
- disabled building of the tests with CMAKE when ONLY_CBLAS is defined
|
||||
- fixed several issues with the handling of runtime limits on the number of OPENMP threads
|
||||
- corrected the error code returned by SGEADD/DGEADD when LDA is too small
|
||||
- corrected the error code returned by IMATCOPY when LDB is too small
|
||||
- updated ?NRM2 to support negative increment values (as introduced in release 3.10
|
||||
of the reference BLAS)
|
||||
- fixed OpenMP builds with CLANG for the case where libomp is not in a standard location
|
||||
- fixed a potential overwrite of unrelated memory during thread initialisation on startup
|
||||
- fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK
|
||||
- fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22
|
||||
- fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE
|
||||
- applied additions and corrections from the development branch of Reference-LAPACK:
|
||||
- fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885)
|
||||
- fixed workspace query results in LAPACK ?SYTRF/?TREVC3 (from Reference-LAPACK PR 883)
|
||||
- fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878)
|
||||
- fixed a crash in LAPACK ?GELSD on NRHS=0 (from Reference-LAPACK PR 876)
|
||||
- added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839)
|
||||
- corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867)
|
||||
- removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860)
|
||||
- updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852)
|
||||
- fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855)
|
||||
- fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849)
|
||||
- added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736)
|
||||
- fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854)
|
||||
- applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847)
|
||||
- removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832)
|
||||
- fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836)
|
||||
- added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837)
|
||||
- updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831)
|
||||
- improved algorithm description in ?GELSY (Reference-LAPACK PR 833)
|
||||
- fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830)
|
||||
- fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768)
|
||||
- added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827)
|
||||
- added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795)
|
||||
- fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820)
|
||||
- adopted refactored ?GEBAL implementation (Reference-LAPACK PR 808)
|
||||
|
||||
x86_64:
|
||||
- added cpu model autodetection for Intel Alder Lake N
|
||||
- added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel
|
||||
- worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer
|
||||
- fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG
|
||||
- fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH
|
||||
- fixed feature-based cputype fallback in DYNAMIC_ARCH
|
||||
- added support for building the AVX512 kernels with the NVIDIA HPC compiler
|
||||
- corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case
|
||||
- fixed a potential use of uninitialized variables in ZTRSM
|
||||
|
||||
ARM64:
|
||||
- added cpu model autodetection for Apple M2
|
||||
- fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register)
|
||||
- added support for building the SVE kernels with the NVIDIA HPC compiler
|
||||
- added support for building the SVE kernels with the Apple Clang compiler
|
||||
- fixed compiler option handling for building the SVE kernels with LLVM
|
||||
- implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse
|
||||
- activated SVE SGEMM and DGEMM kernels for Neoverse V1
|
||||
- improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1
|
||||
- improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH
|
||||
- fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or
|
||||
container restrictions into account
|
||||
- fixed a potential use of uninitialized variables in ZTRSM
|
||||
- fixed a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds
|
||||
|
||||
LOONGARCH64:
|
||||
- added ABI detection
|
||||
- added support for cpu affinity handling
|
||||
- fixed compilation with early versions of the Loongson toolchain
|
||||
- added an optimized SGEMM kernel for 3A5000
|
||||
- added optimized DGEMV kernels for 3A5000
|
||||
- improved the performance of the DGEMM kernel for 3A5000
|
||||
|
||||
MIPS64:
|
||||
- fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target
|
||||
|
||||
POWER:
|
||||
- fixed compiler warnings in the POWER10 SBGEMM kernel
|
||||
|
||||
RISCV:
|
||||
- fixed application of the INTERFACE64 option when building with CMAKE
|
||||
- fixed a potential misdetection of RISCV hardware as 32bit in CMAKE builds
|
||||
- fixed IDAMAX and DOT kernels for C910V
|
||||
- fixed corner cases in the ROT and SWAP kernels for C910V
|
||||
- fixed compilation of the C910V target with recent vendor compilers
|
||||
|
||||
====================================================================
|
||||
Version 0.3.23
|
||||
01-Apr-2023
|
||||
|
|
|
@ -1,9 +1,14 @@
|
|||
node {
|
||||
stage('Checkout') {
|
||||
checkout
|
||||
pipeline {
|
||||
agent {
|
||||
docker {
|
||||
image 'osuosl/ubuntu-s390x'
|
||||
}
|
||||
|
||||
}
|
||||
stages {
|
||||
stage('Build') {
|
||||
sh("make")
|
||||
steps {
|
||||
sh 'make clean && make'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
pipeline {
|
||||
agent {
|
||||
docker {
|
||||
image 'osuosl/ubuntu-ppc64le'
|
||||
}
|
||||
}
|
||||
stages {
|
||||
stage('Build') {
|
||||
steps {
|
||||
sh 'sudo apt update'
|
||||
sh 'sudo apt install gfortran -y'
|
||||
sh 'make clean && make'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
10
Makefile
10
Makefile
|
@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
|
|||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||
|
||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||
.NOTPARALLEL : shared
|
||||
|
||||
all :: libs netlib $(RELA) tests shared
|
||||
all :: tests
|
||||
@echo
|
||||
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
||||
@echo
|
||||
|
@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT)
|
|||
endif
|
||||
endif
|
||||
|
||||
tests : libs netlib $(RELA) shared
|
||||
tests : shared
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
|
@ -373,10 +373,10 @@ ifneq ($(CROSS), 1)
|
|||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
||||
endif
|
||||
|
||||
lapack-runtest:
|
||||
lapack-runtest: lapack-test
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
|
||||
|
||||
|
||||
blas-test:
|
||||
|
|
|
@ -69,7 +69,7 @@ endif
|
|||
# in GCC>=9
|
||||
ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
|
@ -92,9 +92,14 @@ endif
|
|||
# in GCC>=10.4
|
||||
ifeq ($(CORE), NEOVERSEV1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifeq (1, $(ISCLANG))
|
||||
CCOMMON_OPT += -mtune=cortex-x1
|
||||
else
|
||||
CCOMMON_OPT += -mtune=neoverse-v1
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
endif
|
||||
|
@ -122,8 +127,8 @@ endif
|
|||
# in GCC>=10.4
|
||||
ifeq ($(CORE), NEOVERSEN2)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifneq ($(OSNAME), Darwin)
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
else
|
||||
|
@ -155,7 +160,7 @@ endif
|
|||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||
ifeq ($(CORE), CORTEXA55)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ8), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
|
@ -196,8 +201,13 @@ endif
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX3T110)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.3-a
|
||||
ifeq (0, $(ISCLANG))
|
||||
CCOMMON_OPT += -mtune=thunderx3t110
|
||||
else
|
||||
CCOMMON_OPT += -mtune=thunderx2t99
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
endif
|
||||
|
@ -225,9 +235,12 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||
ifeq ($(CORE), EMAG8180)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifeq ($(ISCLANG), 0)
|
||||
CCOMMON_OPT += -mtune=emag
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
endif
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.23
|
||||
VERSION = 0.3.23.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
|
|||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
||||
endif
|
||||
|
||||
#
|
||||
# OS dependent settings
|
||||
#
|
||||
|
@ -645,7 +650,7 @@ DYNAMIC_CORE += HASWELL ZEN
|
|||
endif
|
||||
ifneq ($(NO_AVX512), 1)
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
|
||||
DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1
|
|||
ifneq ($(NO_SVE), 1)
|
||||
DYNAMIC_CORE += NEOVERSEV1
|
||||
DYNAMIC_CORE += NEOVERSEN2
|
||||
DYNAMIC_CORE += ARMV8SVE
|
||||
endif
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
|
@ -932,8 +938,12 @@ BINARY_DEFINED = 1
|
|||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
|
||||
ifneq ($(LA64_ABI), lp64d)
|
||||
LA64_ABI=lp64
|
||||
endif
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
@ -1082,8 +1092,9 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -Wall
|
||||
# make single-threaded LAPACK calls thread-safe #1847
|
||||
FCOMMON_OPT += -frecursive
|
||||
|
@ -1097,6 +1108,7 @@ EXTRALIB += -lgfortran
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifdef NO_BINARY_MODE
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
||||
ifdef BINARY64
|
||||
|
@ -1763,6 +1775,8 @@ export TARGET_CORE
|
|||
export NO_AVX512
|
||||
export NO_AVX2
|
||||
export BUILD_BFLOAT16
|
||||
export NO_LSX
|
||||
export NO_LASX
|
||||
|
||||
export SBGEMM_UNROLL_M
|
||||
export SBGEMM_UNROLL_N
|
||||
|
|
|
@ -75,18 +75,31 @@ endif
|
|||
ifeq ($(CORE), COOPERLAKE)
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # this gcc lacks cooperlake support, fall back to skylake-avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # this gcc lacks cooperlake support, fall back to skylake-avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# cooperlake support was added in clang 9
|
||||
ifeq ($(CLANGVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
@ -104,18 +117,31 @@ endif
|
|||
ifeq ($(CORE), SAPPHIRERAPIDS)
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# sapphire rapids support was added in 11
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # this gcc lacks sapphirerapids support, fall back to skylake-avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
# sapphire rapids support was added in 11
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # this gcc lacks sapphirerapids support, fall back to skylake-avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# sapphire rapids support was added in clang 12
|
||||
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
|
|
@ -6,11 +6,15 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/)
|
||||
Cirrus CI: [](https://cirrus-ci.com/github/xianyi/OpenBLAS)
|
||||
<!-- Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/)-->
|
||||
|
||||
|
||||
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
||||
|
||||
OSUOSL POWERCI [](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
|
||||
|
||||
OSUOSL IBMZ-CI [](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
|
||||
## Introduction
|
||||
|
||||
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
|
||||
|
|
|
@ -115,7 +115,7 @@ jobs:
|
|||
mkdir build
|
||||
cd build
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
|
@ -271,6 +271,19 @@ jobs:
|
|||
- script: |
|
||||
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: OSX_xbuild_DYNAMIC_ARM64
|
||||
pool:
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
|
||||
steps:
|
||||
- script: |
|
||||
ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
|
||||
/Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
|
||||
/Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
|
||||
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: ALPINE_MUSL
|
||||
pool:
|
||||
vmImage: 'ubuntu-latest'
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
Copyright (c) 2014, 2023 The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
|
|||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
|
@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
|
|||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
|
||||
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
|
|
60
c_check
60
c_check
|
@ -31,13 +31,17 @@ flags="$*"
|
|||
|
||||
cross_suffix=""
|
||||
|
||||
if [ "`dirname \"$compiler_name\"`" != '.' ]; then
|
||||
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
|
||||
if [ "`dirname "$compiler_name"`" != '.' ]; then
|
||||
cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
|
||||
fi
|
||||
|
||||
bn=`basename $compiler_name`
|
||||
cn=`echo $compiler_name | sed -e 's/ -.*//'`
|
||||
bn=`basename "$cn"`
|
||||
|
||||
case "$bn" in
|
||||
*-*) cross_suffix="$cross_suffix${bn%-*}-"
|
||||
*-*) if [ "$bn" != '-' ]; then
|
||||
cross_suffix="$cross_suffix${bn%-*}-"
|
||||
fi
|
||||
esac
|
||||
|
||||
compiler=""
|
||||
|
@ -164,7 +168,7 @@ fi
|
|||
|
||||
no_msa=0
|
||||
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
||||
tmpd="$(mktemp -d)"
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"addvi.b $w0, $w1, 1"'
|
||||
msa_flags='-mmsa -mfp64 -mload-store-pairs'
|
||||
|
@ -181,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
|||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
no_lsx=0
|
||||
no_lasx=0
|
||||
if [ "$architecture" = "loongarch64" ]; then
|
||||
tmpd="$(mktemp -d)"
|
||||
tmplsx="$tmpd/lsx.c"
|
||||
codelsx='"vadd.b $vr0, $vr0, $vr0"'
|
||||
lsx_flags='-march=loongarch64 -mlsx'
|
||||
printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
|
||||
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
|
||||
args="$lsx_flags -o $tmplsx.o $tmplsx"
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_lsx=1
|
||||
}
|
||||
|
||||
tmplasx="$tmpd/lasx.c"
|
||||
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
|
||||
lasx_flags='-march=loongarch64 -mlasx'
|
||||
printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
|
||||
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
|
||||
args="$lasx_flags -o $tmplasx.o $tmplasx"
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_lasx=1
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
case "$data" in
|
||||
*ARCH_X86_64*) architecture=x86_64 ;;
|
||||
*ARCH_X86*) architecture=x86 ;;
|
||||
|
@ -204,7 +239,7 @@ esac
|
|||
|
||||
no_avx512=0
|
||||
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
||||
tmpd=`mktemp -d`
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
|
||||
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
|
@ -225,7 +260,7 @@ fi
|
|||
|
||||
no_rv64gv=0
|
||||
if [ "$architecture" = "riscv64" ]; then
|
||||
tmpd=`mktemp -d`
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"vsetvli zero, zero, e8, m1\n"'
|
||||
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
|
@ -241,13 +276,16 @@ fi
|
|||
|
||||
no_sve=0
|
||||
if [ "$architecture" = "arm64" ]; then
|
||||
tmpd=`mktemp -d`
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
|
||||
args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
|
||||
no_sve=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_sve=1
|
||||
}
|
||||
|
@ -257,7 +295,7 @@ fi
|
|||
c11_atomics=0
|
||||
case "$data" in
|
||||
*HAVE_C11*)
|
||||
tmpd=`mktemp -d`
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
|
||||
args=" -c -o $tmpf.o $tmpf"
|
||||
|
@ -395,6 +433,8 @@ done
|
|||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
||||
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
|
||||
} >> "$makefile"
|
||||
|
||||
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
|
||||
|
@ -410,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
|
|||
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
|
||||
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
|
||||
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
|
||||
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
|
||||
} >> "$config"
|
||||
|
||||
|
||||
|
|
45
c_check.pl
45
c_check.pl
|
@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
|||
}
|
||||
}
|
||||
|
||||
$no_lsx = 0;
|
||||
$no_lasx = 0;
|
||||
if (($architecture eq "loongarch64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility";
|
||||
} else {
|
||||
$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
|
||||
$lsx_flags = "-march=loongarch64 -mlsx";
|
||||
print $tmplsx "#include <lsxintrin.h>\n\n";
|
||||
print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
|
||||
|
||||
$args = "$lsx_flags -o $tmplsx.o $tmplsx";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_lsx = 1;
|
||||
} else {
|
||||
$no_lsx = 0;
|
||||
}
|
||||
unlink("$tmplsx.o");
|
||||
|
||||
$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
|
||||
$lasx_flags = "-march=loongarch64 -mlasx";
|
||||
print $tmplasx "#include <lasxintrin.h>\n\n";
|
||||
print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
|
||||
|
||||
$args = "$lasx_flags -o $tmplasx.o $tmplasx";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_lasx = 1;
|
||||
} else {
|
||||
$no_lasx = 0;
|
||||
}
|
||||
unlink("$tmplasx.o");
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
|
@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
|
|||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
|
||||
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
|
||||
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
|
||||
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
|
@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
|||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
||||
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
|
||||
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
|
||||
|
||||
|
||||
if ($os eq "LINUX") {
|
||||
|
|
2
cblas.h
2
cblas.h
|
@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
|
|||
void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
|
||||
void cblas_xerbla(blasint p, char *rout, char *form, ...);
|
||||
void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
|
||||
|
||||
/*** BLAS extensions ***/
|
||||
|
||||
|
|
|
@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
|
|||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
|
||||
set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
|
@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
|
|||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
|
@ -135,7 +135,7 @@ if (ARM64)
|
|||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
||||
if (${ARCH} STREQUAL "riscv64")
|
||||
if (RISCV64)
|
||||
set(NO_BINARY_MODE 1)
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
|
|
@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
|
||||
if (POWER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
|
||||
else ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||
|
@ -172,6 +180,9 @@ endif ()
|
|||
|
||||
if (${CORE} STREQUAL NEOVERSEN2)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
|
@ -179,16 +190,21 @@ if (${CORE} STREQUAL NEOVERSEN2)
|
|||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEV1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
@ -205,8 +221,12 @@ endif ()
|
|||
|
||||
if (${CORE} STREQUAL ARMV8SVE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL CORTEXA510)
|
||||
|
|
|
@ -3,7 +3,8 @@
|
|||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
|
@ -38,16 +39,18 @@ if (${F_COMPILER} STREQUAL "G95")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# ensure reentrancy of lapack codes
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
# work around ABI violation in passing string arguments from C
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
if (NOT NO_LAPACK)
|
||||
# Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
||||
endif ()
|
||||
endif ()
|
||||
if (NO_BINARY_MODE)
|
||||
if (MIPS64)
|
||||
if (BINARY64)
|
||||
|
@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
|
|||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
if (RISCV64)
|
||||
if (BINARY64)
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
else ()
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
|
@ -121,7 +131,7 @@ if (${F_COMPILER} STREQUAL "IBM")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "PGI")
|
||||
if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
|
||||
set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
|
||||
if (BINARY64)
|
||||
|
|
|
@ -124,7 +124,7 @@ set(SLASRC
|
|||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
|
||||
slatrs3.f strsyl3.f sgelst.f)
|
||||
slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90)
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
|
@ -187,7 +187,7 @@ set(CLASRC
|
|||
cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
|
||||
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
|
||||
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
|
||||
crot.f cspcon.f csprfs.f cspsv.f
|
||||
crot.f crscl.f cspcon.f csprfs.f cspsv.f
|
||||
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
|
||||
cstegr.f cstein.f csteqr.f csycon.f
|
||||
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f
|
||||
|
@ -223,7 +223,7 @@ set(CLASRC
|
|||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cungtsqr_row.f cunhr_col.f
|
||||
clatrs3.f ctrsyl3.f cgelst.f)
|
||||
clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90)
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
|
@ -316,7 +316,7 @@ set(DLASRC
|
|||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
|
||||
dlatrs3.f dtrsyl3.f dgelst.f)
|
||||
dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90)
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
|
@ -381,7 +381,7 @@ set(ZLASRC
|
|||
zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
|
||||
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
|
||||
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
|
||||
zrot.f zspcon.f zsprfs.f zspsv.f
|
||||
zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
|
||||
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
|
||||
zstegr.f zstein.f zsteqr.f zsycon.f
|
||||
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f
|
||||
|
@ -419,7 +419,7 @@ set(ZLASRC
|
|||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zungtsqr_row.f zunhr_col.f
|
||||
zlatrs3.f ztrsyl3.f zgelst.f)
|
||||
zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
@ -436,6 +436,7 @@ if(USE_XBLAS)
|
|||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||
endif()
|
||||
|
||||
if(BUILD_LAPACK_DEPRECATED)
|
||||
list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
|
||||
DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
|
||||
DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
|
||||
|
@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
|
|||
DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
|
||||
DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
|
||||
message(STATUS "Building deprecated routines")
|
||||
endif()
|
||||
|
||||
set(DSLASRC spotrs.f)
|
||||
|
||||
|
@ -622,7 +624,7 @@ set(SLASRC
|
|||
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
|
||||
sgesvdq.c slaorhr_col_getrfnp.c
|
||||
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
|
||||
slatrs3.c strsyl3.c sgelst.c)
|
||||
slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c)
|
||||
|
||||
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
|
||||
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
|
||||
|
@ -684,7 +686,7 @@ set(CLASRC
|
|||
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
|
||||
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
|
||||
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
|
||||
crot.c cspcon.c csprfs.c cspsv.c
|
||||
crot.c crscl.c cspcon.c csprfs.c cspsv.c
|
||||
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
|
||||
cstegr.c cstein.c csteqr.c csycon.c
|
||||
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
|
||||
|
@ -720,7 +722,7 @@ set(CLASRC
|
|||
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
|
||||
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
|
||||
cungtsqr.c cungtsqr_row.c cunhr_col.c
|
||||
clatrs3.c ctrsyl3.c cgelst.c)
|
||||
clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c)
|
||||
|
||||
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
|
||||
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
|
||||
|
@ -812,7 +814,7 @@ set(DLASRC
|
|||
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
|
||||
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
|
||||
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
|
||||
dlatrs3.c dtrsyl3.c dgelst.c)
|
||||
dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c)
|
||||
|
||||
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
|
||||
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
|
||||
|
@ -876,7 +878,7 @@ set(ZLASRC
|
|||
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
|
||||
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
|
||||
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
|
||||
zrot.c zspcon.c zsprfs.c zspsv.c
|
||||
zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c
|
||||
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
|
||||
zstegr.c zstein.c zsteqr.c zsycon.c
|
||||
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
|
||||
|
@ -913,7 +915,8 @@ set(ZLASRC
|
|||
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
|
||||
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
|
||||
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
|
||||
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
|
||||
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c
|
||||
zgedmd.c zgedmdq.c)
|
||||
|
||||
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
|
||||
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
|
||||
|
@ -930,6 +933,7 @@ if(USE_XBLAS)
|
|||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||
endif()
|
||||
|
||||
if(BUILD_LAPACK_DEPRECATED)
|
||||
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
|
||||
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
|
||||
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
|
||||
|
@ -943,6 +947,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
|
|||
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
|
||||
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
|
||||
message(STATUS "Building deprecated routines")
|
||||
endif()
|
||||
|
||||
set(DSLASRC spotrs.c)
|
||||
|
||||
|
|
|
@ -70,8 +70,6 @@ set(CSRC
|
|||
lapacke_cgeqlf_work.c
|
||||
lapacke_cgeqp3.c
|
||||
lapacke_cgeqp3_work.c
|
||||
lapacke_cgeqpf.c
|
||||
lapacke_cgeqpf_work.c
|
||||
lapacke_cgeqr.c
|
||||
lapacke_cgeqr_work.c
|
||||
lapacke_cgeqr2.c
|
||||
|
@ -92,6 +90,10 @@ set(CSRC
|
|||
lapacke_cgerqf_work.c
|
||||
lapacke_cgesdd.c
|
||||
lapacke_cgesdd_work.c
|
||||
lapacke_cgedmd.c
|
||||
lapacke_cgedmd_work.c
|
||||
lapacke_cgedmdq.c
|
||||
lapacke_cgedmdq_work.c
|
||||
lapacke_cgesv.c
|
||||
lapacke_cgesv_work.c
|
||||
lapacke_cgesvd.c
|
||||
|
@ -144,12 +146,8 @@ set(CSRC
|
|||
lapacke_cggqrf_work.c
|
||||
lapacke_cggrqf.c
|
||||
lapacke_cggrqf_work.c
|
||||
lapacke_cggsvd.c
|
||||
lapacke_cggsvd_work.c
|
||||
lapacke_cggsvd3.c
|
||||
lapacke_cggsvd3_work.c
|
||||
lapacke_cggsvp.c
|
||||
lapacke_cggsvp_work.c
|
||||
lapacke_cggsvp3.c
|
||||
lapacke_cggsvp3_work.c
|
||||
lapacke_cgtcon.c
|
||||
|
@ -564,6 +562,8 @@ set(CSRC
|
|||
lapacke_ctrsna_work.c
|
||||
lapacke_ctrsyl.c
|
||||
lapacke_ctrsyl_work.c
|
||||
lapacke_ctrsyl3.c
|
||||
lapacke_ctrsyl3_work.c
|
||||
lapacke_ctrtri.c
|
||||
lapacke_ctrtri_work.c
|
||||
lapacke_ctrtrs.c
|
||||
|
@ -596,6 +596,8 @@ set(CSRC
|
|||
lapacke_cungtr_work.c
|
||||
lapacke_cungtsqr_row.c
|
||||
lapacke_cungtsqr_row_work.c
|
||||
lapacke_cunhr_col.c
|
||||
lapacke_cunhr_col_work.c
|
||||
lapacke_cunmbr.c
|
||||
lapacke_cunmbr_work.c
|
||||
lapacke_cunmhr.c
|
||||
|
@ -695,8 +697,6 @@ set(DSRC
|
|||
lapacke_dgeqlf_work.c
|
||||
lapacke_dgeqp3.c
|
||||
lapacke_dgeqp3_work.c
|
||||
lapacke_dgeqpf.c
|
||||
lapacke_dgeqpf_work.c
|
||||
lapacke_dgeqr.c
|
||||
lapacke_dgeqr_work.c
|
||||
lapacke_dgeqr2.c
|
||||
|
@ -717,6 +717,10 @@ set(DSRC
|
|||
lapacke_dgerqf_work.c
|
||||
lapacke_dgesdd.c
|
||||
lapacke_dgesdd_work.c
|
||||
lapacke_dgedmd.c
|
||||
lapacke_dgedmd_work.c
|
||||
lapacke_dgedmdq.c
|
||||
lapacke_dgedmdq_work.c
|
||||
lapacke_dgesv.c
|
||||
lapacke_dgesv_work.c
|
||||
lapacke_dgesvd.c
|
||||
|
@ -771,12 +775,8 @@ set(DSRC
|
|||
lapacke_dggqrf_work.c
|
||||
lapacke_dggrqf.c
|
||||
lapacke_dggrqf_work.c
|
||||
lapacke_dggsvd.c
|
||||
lapacke_dggsvd_work.c
|
||||
lapacke_dggsvd3.c
|
||||
lapacke_dggsvd3_work.c
|
||||
lapacke_dggsvp.c
|
||||
lapacke_dggsvp_work.c
|
||||
lapacke_dggsvp3.c
|
||||
lapacke_dggsvp3_work.c
|
||||
lapacke_dgtcon.c
|
||||
|
@ -874,6 +874,8 @@ set(DSRC
|
|||
lapacke_dorgtr_work.c
|
||||
lapacke_dorgtsqr_row.c
|
||||
lapacke_dorgtsqr_row_work.c
|
||||
lapacke_dorhr_col.c
|
||||
lapacke_dorhr_col_work.c
|
||||
lapacke_dormbr.c
|
||||
lapacke_dormbr_work.c
|
||||
lapacke_dormhr.c
|
||||
|
@ -1186,6 +1188,8 @@ set(DSRC
|
|||
lapacke_dtrsna_work.c
|
||||
lapacke_dtrsyl.c
|
||||
lapacke_dtrsyl_work.c
|
||||
lapacke_dtrsyl3.c
|
||||
lapacke_dtrsyl3_work.c
|
||||
lapacke_dtrtri.c
|
||||
lapacke_dtrtri_work.c
|
||||
lapacke_dtrtrs.c
|
||||
|
@ -1275,8 +1279,6 @@ set(SSRC
|
|||
lapacke_sgeqlf_work.c
|
||||
lapacke_sgeqp3.c
|
||||
lapacke_sgeqp3_work.c
|
||||
lapacke_sgeqpf.c
|
||||
lapacke_sgeqpf_work.c
|
||||
lapacke_sgeqr.c
|
||||
lapacke_sgeqr_work.c
|
||||
lapacke_sgeqr2.c
|
||||
|
@ -1297,6 +1299,10 @@ set(SSRC
|
|||
lapacke_sgerqf_work.c
|
||||
lapacke_sgesdd.c
|
||||
lapacke_sgesdd_work.c
|
||||
lapacke_sgedmd.c
|
||||
lapacke_sgedmd_work.c
|
||||
lapacke_sgedmdq.c
|
||||
lapacke_sgedmdq_work.c
|
||||
lapacke_sgesv.c
|
||||
lapacke_sgesv_work.c
|
||||
lapacke_sgesvd.c
|
||||
|
@ -1351,12 +1357,8 @@ set(SSRC
|
|||
lapacke_sggqrf_work.c
|
||||
lapacke_sggrqf.c
|
||||
lapacke_sggrqf_work.c
|
||||
lapacke_sggsvd.c
|
||||
lapacke_sggsvd_work.c
|
||||
lapacke_sggsvd3.c
|
||||
lapacke_sggsvd3_work.c
|
||||
lapacke_sggsvp.c
|
||||
lapacke_sggsvp_work.c
|
||||
lapacke_sggsvp3.c
|
||||
lapacke_sggsvp3_work.c
|
||||
lapacke_sgtcon.c
|
||||
|
@ -1453,6 +1455,8 @@ set(SSRC
|
|||
lapacke_sorgtr_work.c
|
||||
lapacke_sorgtsqr_row.c
|
||||
lapacke_sorgtsqr_row_work.c
|
||||
lapacke_sorhr_col.c
|
||||
lapacke_sorhr_col_work.c
|
||||
lapacke_sormbr.c
|
||||
lapacke_sormbr_work.c
|
||||
lapacke_sormhr.c
|
||||
|
@ -1762,6 +1766,8 @@ set(SSRC
|
|||
lapacke_strsna_work.c
|
||||
lapacke_strsyl.c
|
||||
lapacke_strsyl_work.c
|
||||
lapacke_strsyl3.c
|
||||
lapacke_strsyl3_work.c
|
||||
lapacke_strtri.c
|
||||
lapacke_strtri_work.c
|
||||
lapacke_strtrs.c
|
||||
|
@ -1849,8 +1855,6 @@ set(ZSRC
|
|||
lapacke_zgeqlf_work.c
|
||||
lapacke_zgeqp3.c
|
||||
lapacke_zgeqp3_work.c
|
||||
lapacke_zgeqpf.c
|
||||
lapacke_zgeqpf_work.c
|
||||
lapacke_zgeqr.c
|
||||
lapacke_zgeqr_work.c
|
||||
lapacke_zgeqr2.c
|
||||
|
@ -1871,6 +1875,10 @@ set(ZSRC
|
|||
lapacke_zgerqf_work.c
|
||||
lapacke_zgesdd.c
|
||||
lapacke_zgesdd_work.c
|
||||
lapacke_zgedmd.c
|
||||
lapacke_zgedmd_work.c
|
||||
lapacke_zgedmdq.c
|
||||
lapacke_zgedmdq_work.c
|
||||
lapacke_zgesv.c
|
||||
lapacke_zgesv_work.c
|
||||
lapacke_zgesvd.c
|
||||
|
@ -1925,12 +1933,8 @@ set(ZSRC
|
|||
lapacke_zggqrf_work.c
|
||||
lapacke_zggrqf.c
|
||||
lapacke_zggrqf_work.c
|
||||
lapacke_zggsvd.c
|
||||
lapacke_zggsvd_work.c
|
||||
lapacke_zggsvd3.c
|
||||
lapacke_zggsvd3_work.c
|
||||
lapacke_zggsvp.c
|
||||
lapacke_zggsvp_work.c
|
||||
lapacke_zggsvp3.c
|
||||
lapacke_zggsvp3_work.c
|
||||
lapacke_zgtcon.c
|
||||
|
@ -2343,6 +2347,8 @@ set(ZSRC
|
|||
lapacke_ztrsna_work.c
|
||||
lapacke_ztrsyl.c
|
||||
lapacke_ztrsyl_work.c
|
||||
lapacke_ztrsyl3.c
|
||||
lapacke_ztrsyl3_work.c
|
||||
lapacke_ztrtri.c
|
||||
lapacke_ztrtri_work.c
|
||||
lapacke_ztrtrs.c
|
||||
|
@ -2375,6 +2381,8 @@ set(ZSRC
|
|||
lapacke_zungtr_work.c
|
||||
lapacke_zungtsqr_row.c
|
||||
lapacke_zungtsqr_row_work.c
|
||||
lapacke_zunhr_col.c
|
||||
lapacke_zunhr_col_work.c
|
||||
lapacke_zunmbr.c
|
||||
lapacke_zunmbr_work.c
|
||||
lapacke_zunmhr.c
|
||||
|
@ -2401,6 +2409,12 @@ set(ZSRC
|
|||
lapacke_csyr_work.c
|
||||
lapacke_ilaver.c
|
||||
)
|
||||
# The LAPACKE wrappers for deprecated LAPACK routines (?geqpf, ?ggsvd, ?ggsvp)
# are only built on request. They must be appended to the per-precision source
# lists defined above (SSRC/DSRC/CSRC/ZSRC); writing to any other variable name
# leaves them out of the build.
if (BUILD_LAPACK_DEPRECATED)
  list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
  list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
  list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
  list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
endif()
|
||||
|
||||
set(SRCX
|
||||
lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c
|
||||
|
|
|
@ -55,7 +55,7 @@ if (DEFINED TARGET)
|
|||
endif ()
|
||||
|
||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||
endif ()
|
||||
|
||||
|
@ -280,7 +280,41 @@ if (DEFINED TARGET)
|
|||
if (${TARGET} STREQUAL POWER8)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
endif()
|
||||
|
||||
if (${TARGET} STREQUAL NEOVERSEV1)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
# Neoverse N2: add the SVE2/BF16 architecture flags to the kernel build.
# The PGI/NVHPC branch additionally enables SVE intrinsics; the flag is spelled
# -Msve_intrinsics, matching the NEOVERSEV1 branch. Other compilers are probed
# with -dumpversion and must report at least 10.4, otherwise configuration
# stops with an error naming the compiler and the version that was found.
if (${TARGET} STREQUAL NEOVERSEN2)
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
  else ()
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
      message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
    endif()
  endif()
endif()
|
||||
# Generic ARMv8 + SVE target: build the kernels for armv8.2-a with SVE.
# The PGI/NVHPC branch also enables SVE intrinsics, using the same
# -Msve_intrinsics spelling as the NEOVERSEV1 branch.
if (${TARGET} STREQUAL ARMV8SVE)
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.2-a+sve")
  else ()
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
  endif()
endif()
|
||||
|
||||
endif()
|
||||
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
endif ()
|
||||
|
|
|
@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
|||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
||||
set(LOONGARCH64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
|
||||
set(RISCV64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
|
@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
|||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(ARM64 1)
|
||||
else()
|
||||
|
@ -107,7 +109,7 @@ else()
|
|||
endif ()
|
||||
|
||||
if (NOT BINARY)
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
|
||||
set(BINARY 64)
|
||||
else ()
|
||||
set(BINARY 32)
|
||||
|
|
|
@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
#message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
endif ()
|
||||
|
||||
# Example 1: SBGEMM_SMALL_M_PERMIT =
|
||||
# Unset the variable
|
||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
set(var_name ${CMAKE_MATCH_1})
|
||||
unset(${var_name})
|
||||
endif()
|
||||
|
||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
|
|
2
common.h
2
common.h
|
@ -525,7 +525,7 @@ static inline unsigned long long rpcc(void){
|
|||
#endif // !RPCC_DEFINED
|
||||
|
||||
#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
#define WMB asm("wmb")
|
||||
#define RMB asm("mb")
|
||||
|
||||
static void __inline blas_lock(unsigned long *address){
|
||||
static __inline void blas_lock(unsigned long *address){
|
||||
#ifndef __DECC
|
||||
unsigned long tmp1, tmp2;
|
||||
asm volatile(
|
||||
|
|
|
@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
|
||||
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
|
||||
int register ret;
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
|
||||
BLASULONG ret;
|
||||
|
||||
|
|
|
@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
return x / y;
|
||||
}
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
/*
 * WhereAmI: executes the LoongArch `rdtimel.w` instruction and returns the
 * value it writes to its second operand ([id]); the value written to the
 * first operand ([counter]) is read but discarded.
 * NOTE(review): presumably the returned id identifies the core the caller is
 * running on (the function is only compiled when NO_AFFINITY is undefined) --
 * confirm against the LoongArch manual's description of the counter ID.
 */
static inline int WhereAmI(void){
|
||||
int ret = 0, counter = 0;
|
||||
/* Both operands are outputs; there are no inputs. The "memory" clobber keeps
 * the compiler from moving memory accesses across the instruction. */
__asm__ volatile (
|
||||
"rdtimel.w %[counter], %[id]"
|
||||
: [id]"=r"(ret), [counter]"=r"(counter)
|
||||
:
|
||||
: "memory"
|
||||
);
|
||||
/* Only the id operand is of interest to callers. */
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||
#else
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -45,12 +46,14 @@
|
|||
|
||||
typedef struct {
|
||||
int dtb_entries;
|
||||
int switch_ratio;
|
||||
int offsetA, offsetB, align;
|
||||
|
||||
#if BUILD_BFLOAT16 == 1
|
||||
int sbgemm_p, sbgemm_q, sbgemm_r;
|
||||
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
||||
int sbgemm_align_k;
|
||||
int need_amxtile_permission; // 0 default, 1 for device support amx.
|
||||
|
||||
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
|
|
|
@ -91,7 +91,7 @@
|
|||
|
||||
void *qalloc(int flags, size_t bytes);
|
||||
|
||||
static void INLINE blas_lock(volatile unsigned long *address){
|
||||
static INLINE void blas_lock(volatile unsigned long *address){
|
||||
|
||||
long int ret, val = 1;
|
||||
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
static void __inline blas_lock(volatile unsigned long *address){
|
||||
static __inline void blas_lock(volatile unsigned long *address){
|
||||
|
||||
long int ret = 1;
|
||||
|
||||
|
|
|
@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
|
|||
/* Global Parameter */
|
||||
extern int blas_cpu_number;
|
||||
extern int blas_num_threads;
|
||||
extern int blas_num_threads_set;
|
||||
extern int blas_omp_linked;
|
||||
|
||||
#define BLAS_LEGACY 0x8000U
|
||||
|
@ -136,15 +135,13 @@ typedef struct blas_queue {
|
|||
#ifdef SMP_SERVER
|
||||
|
||||
extern int blas_server_avail;
|
||||
extern int blas_omp_number_max;
|
||||
|
||||
static __inline int num_cpu_avail(int level) {
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
int openmp_nthreads;
|
||||
if (blas_num_threads_set == 0)
|
||||
openmp_nthreads=omp_get_max_threads();
|
||||
else
|
||||
openmp_nthreads=blas_cpu_number;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
|
@ -156,6 +153,12 @@ int openmp_nthreads;
|
|||
) return 1;
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
if (openmp_nthreads > blas_omp_number_max){
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
|
||||
#endif
|
||||
openmp_nthreads = blas_omp_number_max;
|
||||
}
|
||||
if (blas_cpu_number != openmp_nthreads) {
|
||||
goto_set_num_threads(openmp_nthreads);
|
||||
}
|
||||
|
|
|
@ -54,7 +54,7 @@
|
|||
#define __volatile__
|
||||
#endif
|
||||
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
|
||||
int ret;
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@
|
|||
#define RMB
|
||||
#endif
|
||||
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
|
||||
|
||||
#ifndef C_MSVC
|
||||
|
|
|
@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef ASSEMBLER
|
||||
|
||||
/*
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
|
||||
BLASULONG ret;
|
||||
|
||||
|
|
|
@ -267,8 +267,9 @@ int detect(void)
|
|||
}
|
||||
#else
|
||||
#ifdef __APPLE__
|
||||
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
|
||||
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
|
||||
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
|
||||
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
|
||||
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
|
||||
#endif
|
||||
return CPU_ARMV8;
|
||||
#endif
|
||||
|
|
|
@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
**********************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
/* If LASX extension instructions supported,
|
||||
* using core LOONGSON3R5
|
||||
|
@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CPU_LOONGSON3R5 1
|
||||
#define CPU_LOONGSON2K1000 2
|
||||
|
||||
#define LOONGARCH_CFG2 0x02
|
||||
#define LOONGARCH_LASX 1<<7
|
||||
#define LOONGARCH_LSX 1<<6
|
||||
#define LA_HWCAP_LSX (1<<4)
|
||||
#define LA_HWCAP_LASX (1<<5)
|
||||
|
||||
static char *cpuname[] = {
|
||||
"LOONGSONGENERIC",
|
||||
|
@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
|
|||
|
||||
int detect(void) {
|
||||
#ifdef __linux
|
||||
uint32_t reg = 0;
|
||||
int flag = (int)getauxval(AT_HWCAP);
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (reg & LOONGARCH_LASX)
|
||||
if (flag & LA_HWCAP_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else if (reg & LOONGARCH_LSX)
|
||||
else if (flag & LA_HWCAP_LSX)
|
||||
return CPU_LOONGSON2K1000;
|
||||
else
|
||||
return CPU_GENERIC;
|
||||
|
|
28
cpuid_x86.c
28
cpuid_x86.c
|
@ -1479,6 +1479,8 @@ int get_cpuname(void){
|
|||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 15: // Sapphire Rapids
|
||||
if(support_amx_bf16())
|
||||
return CPUTYPE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
|
@ -1549,6 +1551,7 @@ int get_cpuname(void){
|
|||
case 7: // Raptor Lake
|
||||
case 10:
|
||||
case 15:
|
||||
case 14: // Alder Lake N
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
|
@ -1845,7 +1848,8 @@ static char *cpuname[] = {
|
|||
"ZEN",
|
||||
"SKYLAKEX",
|
||||
"DHYANA",
|
||||
"COOPERLAKE"
|
||||
"COOPERLAKE",
|
||||
"SAPPHIRERAPIDS",
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
|
@ -1902,7 +1906,8 @@ static char *lowercpuname[] = {
|
|||
"zen",
|
||||
"skylakex",
|
||||
"dhyana",
|
||||
"cooperlake"
|
||||
"cooperlake",
|
||||
"sapphirerapids",
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
|
@ -1936,7 +1941,8 @@ static char *corename[] = {
|
|||
"ZEN",
|
||||
"SKYLAKEX",
|
||||
"DHYANA",
|
||||
"COOPERLAKE"
|
||||
"COOPERLAKE",
|
||||
"SAPPHIRERAPIDS",
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
|
@ -1970,7 +1976,8 @@ static char *corename_lower[] = {
|
|||
"zen",
|
||||
"skylakex",
|
||||
"dhyana",
|
||||
"cooperlake"
|
||||
"cooperlake",
|
||||
"sapphirerapids",
|
||||
};
|
||||
|
||||
|
||||
|
@ -2276,16 +2283,18 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 15) { // Sapphire Rapids
|
||||
if(support_amx_bf16())
|
||||
return CORE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
return CORE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -2352,6 +2361,7 @@ int get_coretype(void){
|
|||
case 7: // Raptor Lake
|
||||
case 10:
|
||||
case 15:
|
||||
case 14: // Alder Lake N
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
|
|
|
@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
|||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CEXTRALIB = -lomp
|
||||
CEXTRALIB += -lomp
|
||||
endif
|
||||
endif
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
|
|
|
@ -0,0 +1,270 @@
|
|||
# Guidance for redistributing OpenBLAS
|
||||
|
||||
*We note that this document contains recommendations only - packagers and other
|
||||
redistributors are in charge of how OpenBLAS is built and distributed in their
|
||||
systems, and may have good reasons to deviate from the guidance given on this
|
||||
page. These recommendations are aimed at general packaging systems, with a user
|
||||
base that typically is large, open source (or freely available at least), and
|
||||
doesn't behave uniformly or that the packager is directly connected with.*
|
||||
|
||||
OpenBLAS has a large number of build-time options which can be used to change
|
||||
how it behaves at runtime, how artifacts or symbols are named, etc. Variation
|
||||
in build configuration can be necessary to achieve a given end goal within a
|
||||
distribution or as an end user. However, such variation can also make it more
|
||||
difficult to build on top of OpenBLAS and ship code or other packages in a way
|
||||
that works across many different distros. Here we provide guidance about the
|
||||
most important build options, what effects they may have when changed, and
|
||||
which ones to default to.
|
||||
|
||||
The Make and CMake build systems provide equivalent options and yield more or
|
||||
less the same artifacts, but not exactly (the CMake builds are still
|
||||
experimental). You can choose either one and the options will function in the
|
||||
same way, however the CMake outputs may require some renaming. To review
|
||||
available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
|
||||
the repository.
|
||||
|
||||
Build options typically fall into two categories: (a) options that affect the
|
||||
user interface, such as library and symbol names or APIs that are made
|
||||
available, and (b) options that affect performance and runtime behavior, such
|
||||
as threading behavior or CPU architecture-specific code paths. The user
|
||||
interface options are more important to keep aligned between distributions,
|
||||
while for the performance-related options there are typically more reasons to
|
||||
make choices that deviate from the defaults.
|
||||
|
||||
Here are recommendations for user interface related packaging choices where it
|
||||
is not likely to be a good idea to deviate (typically these are the default
|
||||
settings):
|
||||
|
||||
1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
|
||||
binary size much, so don't turn it off.
|
||||
2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
|
||||
while it does make up a significant part of the binary size of the installed
|
||||
library, that does not outweigh the regression in usability when deviating
|
||||
from the default here.[^1]
|
||||
3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
|
||||
detection files. These files are used by build systems when users want to
|
||||
link against OpenBLAS, and there is no benefit of leaving them out.
|
||||
4. Provide the LP64 interface by default, and if in addition to that you choose
|
||||
to provide an ILP64 interface build as well, use a symbol suffix to avoid
|
||||
symbol name clashes (see the next section).
|
||||
|
||||
[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
|
||||
know. Older versions of Arch Linux did not, and that was known to cause
|
||||
problems.
|
||||
|
||||
|
||||
## ILP64 interface builds
|
||||
|
||||
The LP64 (32-bit integer) interface is the default build, and has
|
||||
well-established C and Fortran APIs as determined by the reference (Netlib)
|
||||
BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
|
||||
not have a standard API: symbol names and shared/static library names can be
|
||||
produced in multiple ways, and this tends to make it difficult to use.
|
||||
As of today there is an agreed-upon way of choosing names for OpenBLAS between
|
||||
a number of key users/redistributors, which is the closest thing to a standard
|
||||
that there is now. However, there is an ongoing standardization effort in the
|
||||
reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
|
||||
agreed-upon convention. In this section we'll aim to explain both.
|
||||
|
||||
Those two methods are fairly similar, and have a key thing in common: *using a
|
||||
symbol suffix*. This is good practice; it is recommended that if you distribute
|
||||
an ILP64 build, you have it use a symbol suffix containing `64` in the name.
|
||||
This avoids potential symbol clashes when different packages which depend on
|
||||
OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
|
||||
|
||||
### The current OpenBLAS agreed-upon ILP64 convention
|
||||
|
||||
This convention comprises the shared library name and the symbol suffix in the
|
||||
shared library. The symbol suffix to use is `64_`, implying that the library
|
||||
name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
|
||||
The central issue where this was discussed is
|
||||
[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
|
||||
include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
|
||||
|
||||
To build shared and static libraries with the currently recommended ILP64
|
||||
conventions with Make:
|
||||
```bash
|
||||
$ make INTERFACE64=1 SYMBOLSUFFIX=64_
|
||||
```
|
||||
|
||||
This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
|
||||
named `openblas64.pc`, and CMake and header files.
|
||||
|
||||
Installing locally and inspecting the output will show a few more details:
|
||||
```bash
|
||||
$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
|
||||
$ tree . # output slightly edited down
|
||||
.
|
||||
├── include
|
||||
│ ├── cblas.h
|
||||
│ ├── f77blas.h
|
||||
│ ├── lapacke_config.h
|
||||
│ ├── lapacke.h
|
||||
│ ├── lapacke_mangling.h
|
||||
│ ├── lapacke_utils.h
|
||||
│ ├── lapack.h
|
||||
│ └── openblas_config.h
|
||||
└── lib
|
||||
├── cmake
|
||||
│ └── openblas
|
||||
│ ├── OpenBLASConfig.cmake
|
||||
│ └── OpenBLASConfigVersion.cmake
|
||||
├── libopenblas64_.a
|
||||
├── libopenblas64_.so
|
||||
└── pkgconfig
|
||||
└── openblas64.pc
|
||||
```
|
||||
|
||||
A key point is the symbol names. These will equal the LP64 symbol names, then
|
||||
(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
|
||||
Hence to obtain the final symbol names, we need to take into account which
|
||||
Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
|
||||
Fortran, or Flang), that means appending a single underscore. In that case, the
|
||||
result is:
|
||||
|
||||
| base API name | binary symbol name | call from Fortran code | call from C code |
|
||||
|---------------|--------------------|------------------------|-----------------------|
|
||||
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
|
||||
| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` |
|
||||
|
||||
It is quite useful to have these symbol names be as uniform as possible across
|
||||
different packaging systems.
|
||||
|
||||
The equivalent build options with CMake are:
|
||||
```bash
|
||||
$ mkdir build && cd build
|
||||
$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
|
||||
$ cmake --build . -j
|
||||
```
|
||||
|
||||
Note that the result is not 100% identical to the Make result. For example, the
|
||||
library name ends in `_64` rather than `64_` - it is recommended to rename them
|
||||
to match the Make library names (also update the `libsuffix` entry in
|
||||
`openblas64.pc` to match that rename).
|
||||
```bash
|
||||
$ cmake --install . --prefix $PWD/../../openblas/cmake64
|
||||
$ tree .
|
||||
.
|
||||
├── include
|
||||
│ └── openblas64
|
||||
│ ├── cblas.h
|
||||
│ ├── f77blas.h
|
||||
│ ├── lapacke_config.h
|
||||
│ ├── lapacke_example_aux.h
|
||||
│ ├── lapacke.h
|
||||
│ ├── lapacke_mangling.h
|
||||
│ ├── lapacke_utils.h
|
||||
│ ├── lapack.h
|
||||
│ ├── openblas64
|
||||
│ │ └── lapacke_mangling.h
|
||||
│ └── openblas_config.h
|
||||
└── lib
|
||||
├── cmake
|
||||
│ └── OpenBLAS64
|
||||
│ ├── OpenBLAS64Config.cmake
|
||||
│ ├── OpenBLAS64ConfigVersion.cmake
|
||||
│ ├── OpenBLAS64Targets.cmake
|
||||
│ └── OpenBLAS64Targets-noconfig.cmake
|
||||
├── libopenblas_64.a
|
||||
├── libopenblas_64.so -> libopenblas_64.so.0
|
||||
└── pkgconfig
|
||||
└── openblas64.pc
|
||||
```
|
||||
|
||||
|
||||
### The upcoming standardized ILP64 convention
|
||||
|
||||
While the `64_` convention above got some adoption, it's slightly hacky and is
|
||||
implemented through the use of `objcopy`. An effort is ongoing for a more
|
||||
broadly adopted convention in the reference BLAS and LAPACK libraries, using
|
||||
(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
|
||||
Fortran compiler mangling. The central issue for this is
|
||||
[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
|
||||
|
||||
For the most common cases of compiler mangling (a single `_` appended), the end
|
||||
result will be:
|
||||
|
||||
| base API name | binary symbol name | call from Fortran code | call from C code |
|
||||
|---------------|--------------------|------------------------|-----------------------|
|
||||
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
|
||||
| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` |
|
||||
|
||||
For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
|
||||
|
||||
The shared library name for this `_64` convention should be `libopenblas_64.so`.
|
||||
|
||||
Note: it is not yet possible to produce an OpenBLAS build which employs this
|
||||
convention! Once reference BLAS and LAPACK with support for `_64` have been
|
||||
released, a future OpenBLAS release will support it. For now, please use the
|
||||
older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
|
||||
considered reserved for future use of the `_64` standard as prescribed by
|
||||
reference BLAS/LAPACK.
|
||||
|
||||
|
||||
## Performance and runtime behavior related build options
|
||||
|
||||
For these options there are multiple reasonable or common choices.
|
||||
|
||||
### Threading related options
|
||||
|
||||
OpenBLAS can be built as a multi-threaded or single-threaded library, with the
|
||||
default being multi-threaded. It's expected that the default `libopenblas`
|
||||
library is multi-threaded; if you'd like to also distribute single-threaded
|
||||
builds, consider naming them `libopenblas_sequential`.
|
||||
|
||||
OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
|
||||
default being pthreads. Both options are commonly used, and the choice here
|
||||
should not influence the shared library name. The choice will be captured by
|
||||
the `.pc` file. For example:
|
||||
```bash
|
||||
$ pkg-config --libs openblas
|
||||
-fopenmp -lopenblas
|
||||
|
||||
$ cat openblas.pc
|
||||
...
|
||||
openblas_config= ... USE_OPENMP=1 MAX_THREADS=24
|
||||
```
|
||||
|
||||
The maximum number of threads users will be able to use is determined at build
|
||||
time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
|
||||
range of values that are reasonable to use (up to 256). 64 is a typical choice
|
||||
here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
|
||||
Please see `Makefile.rule` for more details.
|
||||
|
||||
### CPU architecture related options
|
||||
|
||||
OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
|
||||
distributing to a user base with a variety of hardware, it is recommended to
|
||||
enable CPU architecture runtime detection. This will dynamically select
|
||||
optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
|
||||
build option. This is usually done on all common CPU families, except when
|
||||
there are known issues.
|
||||
|
||||
In case the CPU architecture is known (e.g. you're building binaries for macOS
|
||||
M1 users), it is possible to specify the target architecture directly with the
|
||||
`TARGET=` build option.
|
||||
|
||||
`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
|
||||
in this repository.
|
||||
|
||||
|
||||
## Real-world examples
|
||||
|
||||
OpenBLAS is likely to be distributed in one of these distribution models:
|
||||
|
||||
1. As a standalone package, or multiple packages, in a packaging ecosystem like
|
||||
a Linux distro, Homebrew, conda-forge or MSYS2.
|
||||
2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
|
||||
3. Locally, e.g. making available as a build on a single HPC cluster.
|
||||
|
||||
The guidance on this page is most important for models (1) and (2). These links
|
||||
to build recipes for a representative selection of packaging systems may be
|
||||
helpful as a reference:
|
||||
|
||||
- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
|
||||
- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
|
||||
- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
|
||||
- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
|
||||
- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
|
||||
- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
|
||||
- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -44,10 +45,6 @@
|
|||
#define DIVIDE_RATE 2
|
||||
#endif
|
||||
|
||||
#ifndef SWITCH_RATIO
|
||||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||
|
@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
BLASLONG divN, divT;
|
||||
int mode;
|
||||
|
||||
#if defined(DYNAMIC_ARCH)
|
||||
int switch_ratio = gotoblas->switch_ratio;
|
||||
#else
|
||||
int switch_ratio = SWITCH_RATIO;
|
||||
#endif
|
||||
|
||||
if (range_m) {
|
||||
BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
|
||||
BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
|
||||
|
@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
}
|
||||
*/
|
||||
|
||||
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
|
||||
if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
|
||||
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
|
||||
return 0;
|
||||
}
|
||||
|
@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
divT = nthreads;
|
||||
divN = 1;
|
||||
|
||||
while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
|
||||
while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
|
||||
do {
|
||||
divT --;
|
||||
divN = 1;
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -44,10 +45,6 @@
|
|||
#define DIVIDE_RATE 2
|
||||
#endif
|
||||
|
||||
#ifndef SWITCH_RATIO
|
||||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
//The array of job_t may overflow the stack.
|
||||
//Instead, use malloc to alloc job_t.
|
||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||
|
@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
int mode, mask;
|
||||
double dnum, di, dinum;
|
||||
|
||||
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
|
||||
#if defined(DYNAMIC_ARCH)
|
||||
int switch_ratio = gotoblas->switch_ratio;
|
||||
#else
|
||||
int switch_ratio = SWITCH_RATIO;
|
||||
#endif
|
||||
|
||||
if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
|
||||
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -44,10 +45,6 @@
|
|||
#define DIVIDE_RATE 2
|
||||
#endif
|
||||
|
||||
#ifndef SWITCH_RATIO
|
||||
#define SWITCH_RATIO 2
|
||||
#endif
|
||||
|
||||
#ifndef GEMM_PREFERED_SIZE
|
||||
#define GEMM_PREFERED_SIZE 1
|
||||
#endif
|
||||
|
@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
|||
BLASLONG width, i, j, k, js;
|
||||
BLASLONG m, n, n_from, n_to;
|
||||
int mode;
|
||||
#if defined(DYNAMIC_ARCH)
|
||||
int switch_ratio = gotoblas->switch_ratio;
|
||||
#else
|
||||
int switch_ratio = SWITCH_RATIO;
|
||||
#endif
|
||||
|
||||
/* Get execution mode */
|
||||
#ifndef COMPLEX
|
||||
|
@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
|||
num_parts = 0;
|
||||
while (n > 0){
|
||||
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
|
||||
if (width < SWITCH_RATIO) {
|
||||
width = SWITCH_RATIO;
|
||||
if (width < switch_ratio) {
|
||||
width = switch_ratio;
|
||||
}
|
||||
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||
|
||||
|
@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
|
|||
BLASLONG m = args -> m;
|
||||
BLASLONG n = args -> n;
|
||||
BLASLONG nthreads_m, nthreads_n;
|
||||
#if defined(DYNAMIC_ARCH)
|
||||
int switch_ratio = gotoblas->switch_ratio;
|
||||
#else
|
||||
int switch_ratio = SWITCH_RATIO;
|
||||
#endif
|
||||
|
||||
/* Get dimensions from index ranges if available */
|
||||
if (range_m) {
|
||||
|
@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
|
|||
n = range_n[1] - range_n[0];
|
||||
}
|
||||
|
||||
/* Partitions in m should have at least SWITCH_RATIO rows */
|
||||
if (m < 2 * SWITCH_RATIO) {
|
||||
/* Partitions in m should have at least switch_ratio rows */
|
||||
if (m < 2 * switch_ratio) {
|
||||
nthreads_m = 1;
|
||||
} else {
|
||||
nthreads_m = args -> nthreads;
|
||||
while (m < nthreads_m * SWITCH_RATIO) {
|
||||
while (m < nthreads_m * switch_ratio) {
|
||||
nthreads_m = nthreads_m / 2;
|
||||
}
|
||||
}
|
||||
|
||||
/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
|
||||
if (n < SWITCH_RATIO * nthreads_m) {
|
||||
/* Partitions in n should have at most switch_ratio * nthreads_m columns */
|
||||
if (n < switch_ratio * nthreads_m) {
|
||||
nthreads_n = 1;
|
||||
} else {
|
||||
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
|
||||
nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
|
||||
if (nthreads_m * nthreads_n > args -> nthreads) {
|
||||
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
|
||||
}
|
||||
|
|
|
@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
increased_threads = 1;
|
||||
|
||||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
|
||||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
|
|
|
@ -68,6 +68,7 @@
|
|||
#endif
|
||||
|
||||
int blas_server_avail = 0;
|
||||
int blas_omp_number_max = 0;
|
||||
|
||||
extern int openblas_omp_adaptive_env();
|
||||
|
||||
|
@ -100,8 +101,6 @@ static void adjust_thread_buffers() {
|
|||
|
||||
void goto_set_num_threads(int num_threads) {
|
||||
|
||||
blas_num_threads_set = 1;
|
||||
if (num_threads < 0) blas_num_threads_set = 0;
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||
|
@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) {
|
|||
}
|
||||
|
||||
int blas_thread_init(void){
|
||||
if(blas_omp_number_max <= 0)
|
||||
blas_omp_number_max = omp_get_max_threads();
|
||||
|
||||
blas_get_cpu_number();
|
||||
|
||||
|
|
|
@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads)
|
|||
blas_server_avail = 1;
|
||||
}
|
||||
|
||||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
|
||||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||
|
||||
blas_threads[i] = CreateThread(NULL, 0,
|
||||
blas_thread_server, (void *)i,
|
||||
|
|
|
@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
|||
#else
|
||||
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SAPPHIRERAPIDS
|
||||
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
|
||||
#elif defined(DYN_SKYLAKEX)
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT
|
||||
#endif
|
||||
|
||||
|
||||
#else // not DYNAMIC_LIST
|
||||
|
@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN;
|
|||
#ifndef NO_AVX512
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
extern gotoblas_t gotoblas_COOPERLAKE;
|
||||
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#define gotoblas_COOPERLAKE gotoblas_HASWELL
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
|
@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
|||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
|
@ -378,6 +394,31 @@ int support_avx512_bf16(){
|
|||
#endif
|
||||
}
|
||||
|
||||
#define BIT_AMX_TILE 0x01000000
|
||||
#define BIT_AMX_BF16 0x00400000
|
||||
#define BIT_AMX_ENBD 0x00060000
|
||||
|
||||
int support_amx_bf16() {
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx512())
|
||||
return 0;
|
||||
// CPUID.7.0:EDX indicates AMX support
|
||||
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
|
||||
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
|
||||
// CPUID.D.0:EAX[17:18] indicates AMX enabled
|
||||
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
|
||||
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
|
||||
ret = 1;
|
||||
}
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||
|
@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
}
|
||||
if (model == 15){ // Sapphire Rapids
|
||||
if(support_amx_bf16())
|
||||
return &gotoblas_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return &gotoblas_COOPERLAKE;
|
||||
if (support_avx512())
|
||||
|
@ -941,7 +984,8 @@ static char *corename[] = {
|
|||
"Excavator",
|
||||
"Zen",
|
||||
"SkylakeX",
|
||||
"Cooperlake"
|
||||
"Cooperlake",
|
||||
"SapphireRapids"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
|
@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
|
||||
if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26];
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
|
|||
#else
|
||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_ARMV8SVE
|
||||
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||
#else
|
||||
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEX_A55
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#else
|
||||
|
@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
|
|||
#ifndef NO_SVE
|
||||
extern gotoblas_t gotoblas_NEOVERSEV1;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN2;
|
||||
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
|
||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||
#endif
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
#define NUM_CORETYPES 13
|
||||
#define NUM_CORETYPES 16
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg);
|
|||
#ifndef HWCAP_CPUID
|
||||
#define HWCAP_CPUID (1 << 11)
|
||||
#endif
|
||||
#ifndef HWCAP_SVE
|
||||
#define HWCAP_SVE (1 << 22)
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
||||
|
@ -168,6 +181,7 @@ static char *corename[] = {
|
|||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
"armv8sve",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
|
@ -187,6 +201,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 12: return (&gotoblas_NEOVERSEN2);
|
||||
case 13: return (&gotoblas_THUNDERX3T110);
|
||||
case 14: return (&gotoblas_CORTEXA55);
|
||||
case 15: return (&gotoblas_ARMV8SVE);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
@ -281,8 +297,16 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_NEOVERSEN1;
|
||||
#ifndef NO_SVE
|
||||
case 0xd49:
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
} else
|
||||
return &gotoblas_NEOVERSEN2;
|
||||
case 0xd40:
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
}else
|
||||
return &gotoblas_NEOVERSEV1;
|
||||
#endif
|
||||
case 0xd05: // Cortex A55
|
||||
|
@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
|
|||
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
|
||||
openblas_warning(1, coremsg);
|
||||
}
|
||||
#ifndef NO_SVE
|
||||
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
return &gotoblas_ARMV8SVE;
|
||||
}
|
||||
#endif
|
||||
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
|
|||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int blas_num_threads_set = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
|
|||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int blas_num_threads_set = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
@ -3015,6 +3011,8 @@ void *blas_memory_alloc(int procpos){
|
|||
#endif
|
||||
if (memory_overflowed) goto terminate;
|
||||
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
|
||||
fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
|
||||
fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS);
|
||||
memory_overflowed=1;
|
||||
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
|
||||
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
|
||||
|
|
|
@ -283,7 +283,6 @@ The numbers of threads in the thread pool.
|
|||
This value is equal to or larger than blas_cpu_number. This means some threads are sleeping.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
int blas_num_threads_set = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
|
|
|
@ -21,7 +21,7 @@ blasobjsc="
|
|||
chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
|
||||
chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
|
||||
csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
|
||||
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum"
|
||||
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt"
|
||||
|
||||
blasobjsd="
|
||||
damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
|
||||
|
@ -29,7 +29,7 @@ blasobjsd="
|
|||
dscal dsdot dspmv dspr2 dimatcopy domatcopy
|
||||
dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
|
||||
dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
|
||||
idamax idamin idmax idmin dgeadd dsum"
|
||||
idamax idamin idmax idmin dgeadd dsum dgemmt"
|
||||
|
||||
blasobjss="
|
||||
isamax isamin ismax ismin
|
||||
|
@ -38,7 +38,7 @@ blasobjss="
|
|||
smax smin snrm2 simatcopy somatcopy
|
||||
srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
|
||||
ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
|
||||
strmm strmv strsm strsv sgeadd ssum"
|
||||
strmm strmv strsm strsv sgeadd ssum sgemmt"
|
||||
|
||||
blasobjsz="
|
||||
izamax izamin
|
||||
|
@ -48,7 +48,7 @@ blasobjsz="
|
|||
zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
|
||||
ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
|
||||
zomatcopy zimatcopy dzamax dzamin dzasum dznrm2
|
||||
zgeadd dzsum"
|
||||
zgeadd dzsum zgemmt"
|
||||
|
||||
blasobjs="lsame xerbla"
|
||||
bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
|
||||
|
@ -58,7 +58,7 @@ cblasobjsc="
|
|||
cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
|
||||
cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
|
||||
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
||||
cblas_scnrm2 cblas_scasum
|
||||
cblas_scnrm2 cblas_scasum cblas_cgemmt
|
||||
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
||||
"
|
||||
cblasobjsd="
|
||||
|
@ -67,7 +67,7 @@ cblasobjsd="
|
|||
cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
|
||||
cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
|
||||
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
||||
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd
|
||||
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
|
||||
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
||||
"
|
||||
|
||||
|
@ -78,7 +78,7 @@ cblasobjss="
|
|||
cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
|
||||
cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
|
||||
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
||||
cblas_strsv cblas_sgeadd
|
||||
cblas_strsv cblas_sgeadd cblas_sgemmt
|
||||
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
||||
"
|
||||
|
||||
|
@ -89,7 +89,7 @@ cblasobjsz="
|
|||
cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
|
||||
cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
|
||||
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
||||
cblas_zaxpby cblas_zgeadd
|
||||
cblas_zaxpby cblas_zgeadd cblas_zgemmt
|
||||
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
||||
"
|
||||
|
||||
|
@ -716,6 +716,7 @@ lapackobjs2z="$lapackobjs2z
|
|||
# functions added for lapack-3.7.0
|
||||
lapackobjs2s="$lapackobjs2s
|
||||
slarfy
|
||||
ssyconvf
|
||||
strevc3
|
||||
sgelqt
|
||||
sgelqt3
|
||||
|
@ -843,6 +844,23 @@ lapackobjs2z="$lapackobjs2z
|
|||
zungtsqr_row
|
||||
"
|
||||
|
||||
#functions added for lapack-3.11
|
||||
lapackobjs2c="$lapackobjs2c
|
||||
cgedmd
|
||||
cgedmdq
|
||||
"
|
||||
lapackobjs2d="$lapackobjs2d
|
||||
dgedmd
|
||||
dgedmdq
|
||||
"
|
||||
lapackobjs2s="$lapackobjs2s
|
||||
sgedmd
|
||||
sgedmdq
|
||||
"
|
||||
lapackobjs2z="$lapackobjs2z
|
||||
zgedmd
|
||||
zgedmdq
|
||||
"
|
||||
lapack_extendedprecision_objs="
|
||||
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
|
||||
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
|
||||
|
@ -1012,6 +1030,10 @@ lapackeobjsc="
|
|||
LAPACKE_cgebrd_work
|
||||
LAPACKE_cgecon
|
||||
LAPACKE_cgecon_work
|
||||
LAPACKE_cgedmd
|
||||
LAPACKE_cgedmd_work
|
||||
LAPACKE_cgedmdq
|
||||
LAPACKE_cgedmdq_work
|
||||
LAPACKE_cgeequ
|
||||
LAPACKE_cgeequ_work
|
||||
LAPACKE_cgeequb
|
||||
|
@ -1671,6 +1693,10 @@ lapackeobjsd="
|
|||
LAPACKE_dgebrd_work
|
||||
LAPACKE_dgecon
|
||||
LAPACKE_dgecon_work
|
||||
LAPACKE_dgedmd
|
||||
LAPACKE_dgedmd_work
|
||||
LAPACKE_dgedmdq
|
||||
LAPACKE_dgedmdq_work
|
||||
LAPACKE_dgeequ
|
||||
LAPACKE_dgeequ_work
|
||||
LAPACKE_dgeequb
|
||||
|
@ -2284,6 +2310,10 @@ lapackeobjss="
|
|||
LAPACKE_sgebrd_work
|
||||
LAPACKE_sgecon
|
||||
LAPACKE_sgecon_work
|
||||
LAPACKE_sgedmd
|
||||
LAPACKE_sgedmd_work
|
||||
LAPACKE_sgedmdq
|
||||
LAPACKE_sgedmdq_work
|
||||
LAPACKE_sgeequ
|
||||
LAPACKE_sgeequ_work
|
||||
LAPACKE_sgeequb
|
||||
|
@ -2893,6 +2923,10 @@ lapackeobjsz="
|
|||
LAPACKE_zgebrd_work
|
||||
LAPACKE_zgecon
|
||||
LAPACKE_zgecon_work
|
||||
LAPACKE_zgedmd
|
||||
LAPACKE_zgedmd_work
|
||||
LAPACKE_zgedmdq
|
||||
LAPACKE_zgedmdq_work
|
||||
LAPACKE_zgeequ
|
||||
LAPACKE_zgeequ_work
|
||||
LAPACKE_zgeequb
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
|
||||
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
|
||||
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
|
||||
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum);
|
||||
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
|
||||
|
||||
@blasobjsd = (
|
||||
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
|
||||
|
@ -29,7 +29,7 @@
|
|||
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
|
||||
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
|
||||
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
|
||||
idamax,idamin,idmax,idmin,dgeadd,dsum);
|
||||
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
|
||||
|
||||
@blasobjss = (
|
||||
isamax,isamin,ismax,ismin,
|
||||
|
@ -38,7 +38,7 @@
|
|||
smax,smin,snrm2,simatcopy,somatcopy,
|
||||
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
|
||||
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
|
||||
strmm,strmv,strsm,strsv, sgeadd,ssum);
|
||||
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
|
||||
|
||||
@blasobjsz = (
|
||||
izamax,izamin,,
|
||||
|
@ -48,7 +48,7 @@
|
|||
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
|
||||
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
|
||||
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
|
||||
zgeadd, dzsum);
|
||||
zgeadd, dzsum, zgemmt);
|
||||
|
||||
@blasobjs = (lsame, xerbla);
|
||||
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
|
||||
|
@ -60,7 +60,7 @@
|
|||
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
|
||||
cblas_scnrm2, cblas_scasum,
|
||||
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
|
||||
);
|
||||
cblas_cgemmt);
|
||||
@cblasobjsd = (
|
||||
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
|
||||
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
|
||||
|
@ -69,7 +69,7 @@
|
|||
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
|
||||
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
|
||||
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
|
||||
);
|
||||
cblas_dgemmt);
|
||||
|
||||
@cblasobjss = (
|
||||
cblas_sasum, cblas_saxpy, cblas_saxpby,
|
||||
|
@ -80,7 +80,7 @@
|
|||
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
|
||||
cblas_strsv, cblas_sgeadd,
|
||||
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
|
||||
);
|
||||
cblas_sgemmt);
|
||||
@cblasobjsz = (
|
||||
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
|
||||
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
|
||||
|
@ -90,7 +90,7 @@
|
|||
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
|
||||
cblas_zaxpby, cblas_zgeadd,
|
||||
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
|
||||
);
|
||||
cblas_zgemmt);
|
||||
|
||||
@cblasobjs = ( cblas_xerbla );
|
||||
|
||||
|
|
7
f_check
7
f_check
|
@ -101,6 +101,13 @@ else
|
|||
*flang*)
|
||||
vendor=FLANG
|
||||
openmp='-fopenmp'
|
||||
data=`$compiler -v 2>&1 > /dev/null `
|
||||
v="${data#*version *}"
|
||||
v="${v%%*.}"
|
||||
major="${v%%.*}"
|
||||
if [ "$major" -ge 17 ]; then
|
||||
vendor=FLANGNEW
|
||||
fi
|
||||
;;
|
||||
*ifort*|*ifx*)
|
||||
vendor=INTEL
|
||||
|
|
|
@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n");
|
|||
|
||||
#ifdef MAKE_NB_JOBS
|
||||
#if MAKE_NB_JOBS > 0
|
||||
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
|
||||
printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS);
|
||||
#else
|
||||
// Let make use parent -j argument or -j1 if there
|
||||
// is no make parent
|
||||
#endif
|
||||
#elif NO_PARALLEL_MAKE==1
|
||||
printf("MAKE += -j 1\n");
|
||||
printf("MAKEFLAGS += -j 1\n");
|
||||
#else
|
||||
printf("MAKE += -j %d\n", get_num_cores());
|
||||
printf("MAKEFLAGS += -j %d\n", get_num_cores());
|
||||
#endif
|
||||
|
||||
break;
|
||||
|
|
|
@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
|||
info = 0;
|
||||
|
||||
|
||||
if (lda < MAX(1, m)) info = 6;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
|
||||
if (n < 0) info = 2;
|
||||
|
|
|
@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = {
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||
#define XFEATURE_XTILEDATA 18
|
||||
#define ARCH_REQ_XCOMP_PERM 0x1023
|
||||
static int openblas_amxtile_permission = 0;
|
||||
static int init_amxtile_permission() {
|
||||
long status =
|
||||
syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
|
||||
if (status != 0) {
|
||||
fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, "
|
||||
"Intel Sapphier Rapids), skip sbgemm calculation\n");
|
||||
return -1;
|
||||
}
|
||||
openblas_amxtile_permission = 1;
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(char *TRANSA, char *TRANSB,
|
||||
|
@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||
#if defined(DYNAMIC_ARCH)
|
||||
if (gotoblas->need_amxtile_permission &&
|
||||
openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS)
|
||||
if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||
|
||||
if ((args.m == 0) || (args.n == 0)) return;
|
||||
|
||||
#if 0
|
||||
|
|
|
@ -35,29 +35,26 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#define SMP_THRESHOLD_MIN 65536.0
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QGEMT "
|
||||
#define ERROR_NAME "QGEMMT "
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DGEMT "
|
||||
#define ERROR_NAME "DGEMMT "
|
||||
#elif defined(BFLOAT16)
|
||||
#define ERROR_NAME "SBGEMT "
|
||||
#define ERROR_NAME "SBGEMMT "
|
||||
#else
|
||||
#define ERROR_NAME "SGEMT "
|
||||
#define ERROR_NAME "SGEMMT "
|
||||
#endif
|
||||
#else
|
||||
#define SMP_THRESHOLD_MIN 8192.0
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XGEMT "
|
||||
#define ERROR_NAME "XGEMMT "
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZGEMT "
|
||||
#define ERROR_NAME "ZGEMMT "
|
||||
#else
|
||||
#define ERROR_NAME "CGEMT "
|
||||
#define ERROR_NAME "CGEMMT "
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -68,18 +65,19 @@
|
|||
#ifndef CBLAS
|
||||
|
||||
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||
blasint * M, blasint * N, blasint * K,
|
||||
blasint * M, blasint * K,
|
||||
FLOAT * Alpha,
|
||||
IFLOAT * a, blasint * ldA,
|
||||
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
||||
{
|
||||
|
||||
blasint m, n, k;
|
||||
blasint m, k;
|
||||
blasint lda, ldb, ldc;
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
|
||||
char transA, transB, Uplo;
|
||||
blasint nrowa, nrowb;
|
||||
IFLOAT *buffer;
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
|
@ -92,7 +90,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
|||
PRINT_DEBUG_NAME;
|
||||
|
||||
m = *M;
|
||||
n = *N;
|
||||
k = *K;
|
||||
|
||||
#if defined(COMPLEX)
|
||||
|
@ -159,32 +156,39 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
|||
if (Uplo == 'L')
|
||||
uplo = 1;
|
||||
|
||||
nrowa = m;
|
||||
if (transa) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb) nrowb = m;
|
||||
|
||||
info = 0;
|
||||
|
||||
if (uplo < 0)
|
||||
info = 14;
|
||||
if (ldc < m)
|
||||
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowa))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowb))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (n < 0)
|
||||
info = 4;
|
||||
if (m < 0)
|
||||
info = 3;
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
|
||||
if (info) {
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
#else
|
||||
|
||||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
|
||||
blasint N, blasint k,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
|
||||
blasint k,
|
||||
#ifndef COMPLEX
|
||||
FLOAT alpha,
|
||||
IFLOAT * A, blasint LDA,
|
||||
|
@ -205,17 +209,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
blasint m, n, lda, ldb;
|
||||
blasint lda, ldb;
|
||||
FLOAT *a, *b;
|
||||
XFLOAT *buffer;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
uplo = -1;
|
||||
transa = -1;
|
||||
transb = -1;
|
||||
info = 0;
|
||||
|
||||
if (order == CblasColMajor) {
|
||||
if (Uplo == CblasUpper) uplo = 0;
|
||||
if (Uplo == CblasLower) uplo = 1;
|
||||
|
||||
if (TransA == CblasNoTrans)
|
||||
transa = 0;
|
||||
|
@ -248,9 +255,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
transb = 3;
|
||||
#endif
|
||||
|
||||
m = M;
|
||||
n = N;
|
||||
|
||||
a = (void *)A;
|
||||
b = (void *)B;
|
||||
lda = LDA;
|
||||
|
@ -258,23 +262,31 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
info = -1;
|
||||
|
||||
if (ldc < m)
|
||||
blasint nrowa, nrowb;
|
||||
nrowa = m;
|
||||
if (transa) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb) nrowb = m;
|
||||
|
||||
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowa))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (n < 0)
|
||||
info = 4;
|
||||
if (m < 0)
|
||||
info = 3;
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
||||
if (order == CblasRowMajor) {
|
||||
m = N;
|
||||
n = M;
|
||||
|
||||
a = (void *)B;
|
||||
b = (void *)A;
|
||||
|
@ -282,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
lda = LDB;
|
||||
ldb = LDA;
|
||||
|
||||
if (Uplo == CblasUpper) uplo = 0;
|
||||
if (Uplo == CblasLower) uplo = 1;
|
||||
|
||||
if (TransB == CblasNoTrans)
|
||||
transa = 0;
|
||||
if (TransB == CblasTrans)
|
||||
|
@ -315,28 +330,29 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
info = -1;
|
||||
|
||||
if (ldc < m)
|
||||
blasint ncola, ncolb;
|
||||
ncola = k;
|
||||
if (transa) ncola = m;
|
||||
ncolb = m;
|
||||
if (transb) ncolb = k;
|
||||
|
||||
if (ldc < MAX(1,m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, ncolb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, ncola))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (n < 0)
|
||||
info = 4;
|
||||
if (m < 0)
|
||||
info = 3;
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 1;
|
||||
|
||||
}
|
||||
|
||||
uplo = -1;
|
||||
if (Uplo == CblasUpper)
|
||||
uplo = 0;
|
||||
if (Uplo == CblasLower)
|
||||
uplo = 1;
|
||||
info = 2;
|
||||
if (uplo < 0)
|
||||
info = 14;
|
||||
info = 1;
|
||||
}
|
||||
|
||||
if (info >= 0) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
|
@ -407,37 +423,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
#endif
|
||||
|
||||
if ((m == 0) || (n == 0))
|
||||
if (m == 0)
|
||||
return;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
const blasint incb = (transb == 0) ? 1 : ldb;
|
||||
|
||||
if (uplo == 1) {
|
||||
for (i = 0; i < n; i++) {
|
||||
j = n - i;
|
||||
for (i = 0; i < m; i++) {
|
||||
j = m - i;
|
||||
|
||||
l = j;
|
||||
#if defined(COMPLEX)
|
||||
aa = a + i * 2;
|
||||
bb = b + i * ldb * 2;
|
||||
if (transa) {
|
||||
l = k;
|
||||
aa = a + lda * i * 2;
|
||||
bb = b + i * 2;
|
||||
}
|
||||
if (transb)
|
||||
bb = b + i * 2;
|
||||
cc = c + i * 2 * ldc + i * 2;
|
||||
#else
|
||||
aa = a + i;
|
||||
bb = b + i * ldb;
|
||||
if (transa) {
|
||||
l = k;
|
||||
aa = a + lda * i;
|
||||
bb = b + i;
|
||||
}
|
||||
if (transb)
|
||||
bb = b + i;
|
||||
cc = c + i * ldc + i;
|
||||
#endif
|
||||
|
||||
|
@ -458,8 +472,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
|
@ -479,20 +491,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (!transa)
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
aa, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
|
||||
aa, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
if (!transa)
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
||||
bb, incb, cc, 1, buffer);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
|
||||
bb, incb, cc, 1, buffer);
|
||||
#endif
|
||||
#ifdef SMP
|
||||
} else {
|
||||
|
||||
if (!transa)
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||
lda, bb, incb, cc,
|
||||
1, buffer,
|
||||
nthreads);
|
||||
else
|
||||
(gemv_thread[(int)transa]) (k, j, alpha, aa,
|
||||
lda, bb, incb, cc,
|
||||
1, buffer,
|
||||
nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -501,21 +527,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
}
|
||||
} else {
|
||||
|
||||
for (i = 0; i < n; i++) {
|
||||
for (i = 0; i < m; i++) {
|
||||
j = i + 1;
|
||||
|
||||
l = j;
|
||||
#if defined COMPLEX
|
||||
bb = b + i * ldb * 2;
|
||||
if (transa) {
|
||||
l = k;
|
||||
if (transb) {
|
||||
bb = b + i * 2;
|
||||
}
|
||||
cc = c + i * 2 * ldc;
|
||||
#else
|
||||
bb = b + i * ldb;
|
||||
if (transa) {
|
||||
l = k;
|
||||
if (transb) {
|
||||
bb = b + i;
|
||||
}
|
||||
cc = c + i * ldc;
|
||||
|
@ -537,8 +561,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
|
@ -558,30 +580,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (!transa)
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
a, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
|
||||
a, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
if (!transa)
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
||||
incb, cc, 1, buffer);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
|
||||
incb, cc, 1, buffer);
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
|
||||
if (!transa)
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||
bb, incb, cc, 1,
|
||||
buffer, nthreads);
|
||||
|
||||
else
|
||||
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
|
||||
bb, incb, cc, 1,
|
||||
buffer, nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
}
|
||||
}
|
||||
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
|
||||
args.m * args.k + args.k * args.n +
|
||||
args.m * args.n, 2 * args.m * args.n * args.k);
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
|
|
|
@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
|
||||
if ( order == BlasColMajor)
|
||||
{
|
||||
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
|
||||
if ( trans == BlasTrans && *ldb < *cols ) info = 9;
|
||||
if ( trans == BlasNoTrans && *ldb < *rows ) info = 8;
|
||||
if ( trans == BlasTrans && *ldb < *cols ) info = 8;
|
||||
}
|
||||
if ( order == BlasRowMajor)
|
||||
{
|
||||
if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
|
||||
if ( trans == BlasTrans && *ldb < *rows ) info = 9;
|
||||
if ( trans == BlasNoTrans && *ldb < *cols ) info = 8;
|
||||
if ( trans == BlasTrans && *ldb < *rows ) info = 8;
|
||||
}
|
||||
|
||||
if ( order == BlasColMajor && *lda < *rows ) info = 7;
|
||||
|
@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef NEW_IMATCOPY
|
||||
if ( *lda == *ldb && *rows == *cols) {
|
||||
if ( *lda == *ldb ) {
|
||||
if ( order == BlasColMajor )
|
||||
{
|
||||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda );
|
||||
return;
|
||||
}
|
||||
else
|
||||
else if ( *rows == *cols )
|
||||
{
|
||||
IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda );
|
||||
return;
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -138,26 +141,23 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda );
|
||||
}
|
||||
else
|
||||
{
|
||||
IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
else if ( *rows == *cols )
|
||||
{
|
||||
IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( *lda > *ldb )
|
||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT);
|
||||
else
|
||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT);
|
||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
{
|
||||
printf("Memory alloc failed\n");
|
||||
printf("Memory alloc failed in imatcopy\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
{
|
||||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb );
|
||||
}
|
||||
else
|
||||
{
|
||||
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb );
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb );
|
||||
}
|
||||
else
|
||||
{
|
||||
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
|
||||
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
|||
|
||||
if (n <= 0) return 0.;
|
||||
|
||||
#ifndef COMPLEX
|
||||
if (n == 1)
|
||||
#ifdef DOUBLE
|
||||
return fabs(x[0]);
|
||||
#else
|
||||
return fabsf(x[0]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (incx < 0)
|
||||
#ifdef COMPLEX
|
||||
x -= (n - 1) * incx * 2;
|
||||
#else
|
||||
x -= (n - 1) * incx;
|
||||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
|||
|
||||
if (n <= 0) return 0.;
|
||||
|
||||
#ifndef COMPLEX
|
||||
if (n == 1)
|
||||
#ifdef DOUBLE
|
||||
return fabs(x[0]);
|
||||
#else
|
||||
return fabsf(x[0]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (incx < 0)
|
||||
#ifdef COMPLEX
|
||||
x -= (n - 1) * incx * 2;
|
||||
#else
|
||||
x -= (n - 1) * incx;
|
||||
#endif
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
|
@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef DOUBLE
|
||||
long double safmin = DBL_MIN;
|
||||
#else
|
||||
long double safmin = FLT_MIN;
|
||||
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
|
||||
|
||||
long double da = *DA;
|
||||
long double db = *DB;
|
||||
long double c;
|
||||
long double s;
|
||||
long double r, roe, z;
|
||||
long double r, z;
|
||||
long double sigma, dascal,dbscal;
|
||||
|
||||
long double ada = fabsl(da);
|
||||
long double adb = fabsl(db);
|
||||
long double scale = ada + adb;
|
||||
long double maxab = MAX(ada,adb);
|
||||
long double safmax;
|
||||
long double scale;
|
||||
|
||||
|
||||
#ifndef CBLAS
|
||||
PRINT_DEBUG_NAME;
|
||||
|
@ -32,17 +44,25 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
PRINT_DEBUG_CNAME;
|
||||
#endif
|
||||
|
||||
roe = db;
|
||||
if (ada > adb) roe = da;
|
||||
|
||||
if (scale == ZERO) {
|
||||
if (adb == ZERO) {
|
||||
*C = ONE;
|
||||
*S = ZERO;
|
||||
*DA = ZERO;
|
||||
*DB = ZERO;
|
||||
} else if (ada == ZERO) {
|
||||
*C = ZERO;
|
||||
*S = ONE;
|
||||
*DA = *DB;
|
||||
*DB = ONE;
|
||||
} else {
|
||||
r = sqrt(da * da + db * db);
|
||||
if (roe < 0) r = -r;
|
||||
safmax = 1./safmin;
|
||||
scale = MIN(MAX(safmin,maxab), safmax);
|
||||
if (ada > adb)
|
||||
sigma = copysign(1.,da);
|
||||
else
|
||||
sigma = copysign(1.,db);
|
||||
dascal = da / scale;
|
||||
dbscal = db / scale;
|
||||
r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal));
|
||||
c = da / r;
|
||||
s = db / r;
|
||||
z = ONE;
|
||||
|
@ -65,11 +85,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
FLOAT db = *DB;
|
||||
FLOAT c = *C;
|
||||
FLOAT s = *S;
|
||||
FLOAT r, roe, z;
|
||||
FLOAT sigma;
|
||||
FLOAT r, z;
|
||||
|
||||
FLOAT ada = fabs(da);
|
||||
FLOAT adb = fabs(db);
|
||||
FLOAT scale = ada + adb;
|
||||
FLOAT maxab = MAX(ada,adb);
|
||||
long double safmax ;
|
||||
FLOAT scale ;
|
||||
|
||||
safmax = 1./safmin;
|
||||
scale = MIN(MAX(safmin,maxab), safmax);
|
||||
|
||||
if (ada > adb)
|
||||
sigma = copysign(1.,da);
|
||||
else
|
||||
sigma = copysign(1.,db);
|
||||
|
||||
#ifndef CBLAS
|
||||
PRINT_DEBUG_NAME;
|
||||
|
@ -77,20 +108,21 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
|||
PRINT_DEBUG_CNAME;
|
||||
#endif
|
||||
|
||||
roe = db;
|
||||
if (ada > adb) roe = da;
|
||||
|
||||
if (scale == ZERO) {
|
||||
if (adb == ZERO) {
|
||||
*C = ONE;
|
||||
*S = ZERO;
|
||||
*DA = ZERO;
|
||||
*DB = ZERO;
|
||||
} else if (ada == ZERO) {
|
||||
*C = ZERO;
|
||||
*S = ONE;
|
||||
*DA = *DB;
|
||||
*DB = ONE;
|
||||
} else {
|
||||
FLOAT aa = da / scale;
|
||||
FLOAT bb = db / scale;
|
||||
|
||||
r = scale * sqrt(aa * aa + bb * bb);
|
||||
if (roe < 0) r = -r;
|
||||
r = sigma * scale * sqrt(aa * aa + bb * bb);
|
||||
c = da / r;
|
||||
s = db / r;
|
||||
z = ONE;
|
||||
|
|
|
@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO,
|
|||
int nodes;
|
||||
#endif
|
||||
# if defined(SMP)
|
||||
int MN;
|
||||
double MN;
|
||||
#endif
|
||||
blasint info;
|
||||
int side;
|
||||
|
@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
int nodes;
|
||||
#endif
|
||||
#if defined(SMP)
|
||||
int MN;
|
||||
double MN;
|
||||
#endif
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
|
|
@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS,
|
|||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
int NNK;
|
||||
double NNK;
|
||||
#ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
|
@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
|
|||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
int NNK;
|
||||
double NNK;
|
||||
|
||||
#ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
#ifndef COMPLEX
|
||||
|
|
|
@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#ifdef NEW_IMATCOPY
|
||||
if (*lda == *ldb && *cols == *rows) {
|
||||
if (*lda == *ldb ) {
|
||||
if ( order == BlasColMajor )
|
||||
{
|
||||
|
||||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasConj )
|
||||
{
|
||||
IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTrans )
|
||||
if ( trans == BlasTrans && *rows == *cols )
|
||||
{
|
||||
IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
if ( trans == BlasTransConj && *rows == *cols )
|
||||
{
|
||||
IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -153,28 +159,29 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasConj )
|
||||
{
|
||||
IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTrans )
|
||||
if ( trans == BlasTrans && *rows == *cols )
|
||||
{
|
||||
IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
return;
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
if ( trans == BlasTransConj && *rows == *cols )
|
||||
{
|
||||
IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( *lda > *ldb )
|
||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
else
|
||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
@ -183,37 +190,28 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
if ( order == BlasColMajor )
|
||||
{
|
||||
|
||||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||
}
|
||||
if ( trans == BlasConj )
|
||||
else if ( trans == BlasConj )
|
||||
{
|
||||
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||
}
|
||||
if ( trans == BlasTrans )
|
||||
else if ( trans == BlasTrans )
|
||||
{
|
||||
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
else if ( trans == BlasTransConj )
|
||||
{
|
||||
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
|
||||
if ( trans == BlasNoTrans )
|
||||
{
|
||||
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||
}
|
||||
if ( trans == BlasConj )
|
||||
else if ( trans == BlasConj )
|
||||
{
|
||||
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
|
||||
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
|
||||
}
|
||||
if ( trans == BlasTrans )
|
||||
else if ( trans == BlasTrans )
|
||||
{
|
||||
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||
}
|
||||
if ( trans == BlasTransConj )
|
||||
else if ( trans == BlasTransConj )
|
||||
{
|
||||
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
|
||||
free(b);
|
||||
return;
|
||||
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
|
||||
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
free(b);
|
||||
return;
|
||||
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef CBLAS
|
||||
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
|
||||
|
@ -14,123 +16,166 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
|
|||
FLOAT *S = (FLOAT*) VS;
|
||||
#endif /* CBLAS */
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
|
||||
|
||||
long double da_r = *(DA + 0);
|
||||
long double da_i = *(DA + 1);
|
||||
long double db_r = *(DB + 0);
|
||||
long double db_i = *(DB + 1);
|
||||
long double r;
|
||||
|
||||
long double ada = fabsl(da_r) + fabsl(da_i);
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (ada == ZERO) {
|
||||
*C = ZERO;
|
||||
*(S + 0) = ONE;
|
||||
*(S + 1) = ZERO;
|
||||
*(DA + 0) = db_r;
|
||||
*(DA + 1) = db_i;
|
||||
} else {
|
||||
long double alpha_r, alpha_i;
|
||||
|
||||
ada = sqrt(da_r * da_r + da_i * da_i);
|
||||
|
||||
r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i);
|
||||
|
||||
alpha_r = da_r / ada;
|
||||
alpha_i = da_i / ada;
|
||||
|
||||
*(C + 0) = ada / r;
|
||||
*(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r;
|
||||
*(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r;
|
||||
*(DA + 0) = alpha_r * r;
|
||||
*(DA + 1) = alpha_i * r;
|
||||
}
|
||||
#ifdef DOUBLE
|
||||
long double safmin = DBL_MIN;
|
||||
long double rtmin = sqrt(DBL_MIN/DBL_EPSILON);
|
||||
#else
|
||||
FLOAT da_r = *(DA + 0);
|
||||
FLOAT da_i = *(DA + 1);
|
||||
FLOAT db_r = *(DB + 0);
|
||||
FLOAT db_i = *(DB + 1);
|
||||
FLOAT r;
|
||||
|
||||
FLOAT ada = fabs(da_r) + fabs(da_i);
|
||||
FLOAT adb;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (ada == ZERO) {
|
||||
*C = ZERO;
|
||||
*(S + 0) = ONE;
|
||||
*(S + 1) = ZERO;
|
||||
*(DA + 0) = db_r;
|
||||
*(DA + 1) = db_i;
|
||||
} else {
|
||||
FLOAT scale;
|
||||
FLOAT aa_r, aa_i, bb_r, bb_i;
|
||||
FLOAT alpha_r, alpha_i;
|
||||
|
||||
aa_r = fabs(da_r);
|
||||
aa_i = fabs(da_i);
|
||||
|
||||
if (aa_i > aa_r) {
|
||||
aa_r = fabs(da_i);
|
||||
aa_i = fabs(da_r);
|
||||
}
|
||||
|
||||
if (aa_r == ZERO) {
|
||||
ada = 0.;
|
||||
} else {
|
||||
scale = (aa_i / aa_r);
|
||||
ada = aa_r * sqrt(ONE + scale * scale);
|
||||
}
|
||||
|
||||
bb_r = fabs(db_r);
|
||||
bb_i = fabs(db_i);
|
||||
|
||||
if (bb_i > bb_r) {
|
||||
bb_r = fabs(bb_i);
|
||||
bb_i = fabs(bb_r);
|
||||
}
|
||||
|
||||
if (bb_r == ZERO) {
|
||||
adb = 0.;
|
||||
} else {
|
||||
scale = (bb_i / bb_r);
|
||||
adb = bb_r * sqrt(ONE + scale * scale);
|
||||
}
|
||||
scale = ada + adb;
|
||||
|
||||
aa_r = da_r / scale;
|
||||
aa_i = da_i / scale;
|
||||
bb_r = db_r / scale;
|
||||
bb_i = db_i / scale;
|
||||
|
||||
r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
|
||||
|
||||
alpha_r = da_r / ada;
|
||||
alpha_i = da_i / ada;
|
||||
|
||||
*(C + 0) = ada / r;
|
||||
*(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r;
|
||||
*(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r;
|
||||
*(DA + 0) = alpha_r * r;
|
||||
*(DA + 1) = alpha_i * r;
|
||||
}
|
||||
long double safmin = FLT_MIN;
|
||||
long double rtmin = sqrt(FLT_MIN/FLT_EPSILON);
|
||||
#endif
|
||||
|
||||
FUNCTION_PROFILE_END(4, 4, 4);
|
||||
|
||||
IDEBUG_END;
|
||||
FLOAT da_r = *(DA+0);
|
||||
FLOAT da_i = *(DA+1);
|
||||
FLOAT db_r = *(DB+0);
|
||||
FLOAT db_i = *(DB+1);
|
||||
//long double r;
|
||||
FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT));
|
||||
FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT));
|
||||
long double d;
|
||||
|
||||
FLOAT ada = da_r * da_r + da_i * da_i;
|
||||
FLOAT adb = db_r * db_r + db_i * db_i;
|
||||
FLOAT adart = sqrt( da_r * da_r + da_i * da_i);
|
||||
FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i);
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (db_r == ZERO && db_i == ZERO) {
|
||||
*C = ONE;
|
||||
*(S + 0) = ZERO;
|
||||
*(S + 1) = ZERO;
|
||||
return;
|
||||
}
|
||||
|
||||
long double safmax = 1./safmin;
|
||||
#if defined DOUBLE
|
||||
long double rtmax = safmax /DBL_EPSILON;
|
||||
#else
|
||||
long double rtmax = safmax /FLT_EPSILON;
|
||||
#endif
|
||||
*(S1 + 0) = *(DB + 0);
|
||||
*(S1 + 1) = *(DB + 1) *-1;
|
||||
if (da_r == ZERO && da_i == ZERO) {
|
||||
*C = ZERO;
|
||||
if (db_r == ZERO) {
|
||||
(*DA) = fabsl(db_i);
|
||||
*S = *S1 /da_r;
|
||||
*(S+1) = *(S1+1) /da_r;
|
||||
return;
|
||||
} else if ( db_i == ZERO) {
|
||||
*DA = fabsl(db_r);
|
||||
*S = *S1 /da_r;
|
||||
*(S+1) = *(S1+1) /da_r;
|
||||
return;
|
||||
} else {
|
||||
long double g1 = MAX( fabsl(db_r), fabsl(db_i));
|
||||
rtmax =sqrt(safmax/2.);
|
||||
if (g1 > rtmin && g1 < rtmax) { // unscaled
|
||||
d = sqrt(adb);
|
||||
*S = *S1 /d;
|
||||
*(S+1) = *(S1+1) /d;
|
||||
*DA = d ;
|
||||
*(DA+1) = ZERO;
|
||||
return;
|
||||
} else { // scaled algorithm
|
||||
long double u = MIN ( safmax, MAX ( safmin, g1));
|
||||
FLOAT gs_r = db_r/u;
|
||||
FLOAT gs_i = db_i/u;
|
||||
d = sqrt ( gs_r*gs_r + gs_i*gs_i);
|
||||
*S = gs_r / d;
|
||||
*(S + 1) = (gs_i * -1) / d;
|
||||
*DA = d * u;
|
||||
*(DA+1) = ZERO;
|
||||
return;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i));
|
||||
FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i));
|
||||
rtmax = sqrt(safmax / 4.);
|
||||
if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled
|
||||
long double h = ada + adb;
|
||||
double adahsq = sqrt(ada * h);
|
||||
if (ada >= h *safmin) {
|
||||
*C = sqrt(ada/h);
|
||||
*R = *DA / *C;
|
||||
*(R+1) = *(DA+1) / *(C+1);
|
||||
rtmax *= 2.;
|
||||
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
|
||||
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
|
||||
*(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq);
|
||||
} else {
|
||||
*S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h);
|
||||
*(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h);
|
||||
}
|
||||
} else {
|
||||
*C = ada / adahsq;
|
||||
if (*C >= safmin)
|
||||
*R = *DA / *C;
|
||||
else
|
||||
*R = *DA * (h / adahsq);
|
||||
*S = *S1 * ada / adahsq;
|
||||
*(S+1) = *(S1+1) * ada / adahsq;
|
||||
}
|
||||
*DA=*R;
|
||||
*(DA+1)=*(R+1);
|
||||
return;
|
||||
} else { // scaled
|
||||
FLOAT fs_r, fs_i, gs_r, gs_i;
|
||||
long double v,w,f2,g2,h;
|
||||
long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1)));
|
||||
gs_r = db_r/u;
|
||||
gs_i = db_i/u;
|
||||
g2 = sqrt ( gs_r*gs_r + gs_i*gs_i);
|
||||
if (f1 /u < rtmin) {
|
||||
v = MIN (safmax, MAX (safmin, f1));
|
||||
w = v / u;
|
||||
fs_r = *DA/ v;
|
||||
fs_i = *(DA+1) / v;
|
||||
f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
|
||||
h = f2 * w * w + g2;
|
||||
} else { // use same scaling for both
|
||||
w = 1.;
|
||||
fs_r = *DA/ u;
|
||||
fs_i = *(DA+1) / u;
|
||||
f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
|
||||
h = f2 + g2;
|
||||
}
|
||||
if ( f2 >= h * safmin) {
|
||||
*C = sqrt ( f2 / h );
|
||||
*DA = fs_r / *C;
|
||||
*(DA+1) = fs_i / *C;
|
||||
rtmax *= 2;
|
||||
if ( f2 > rtmin && h < rtmax) {
|
||||
*S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h));
|
||||
*(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h));
|
||||
} else {
|
||||
*S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h);
|
||||
*(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h);
|
||||
}
|
||||
} else { // intermediates might overflow
|
||||
d = sqrt ( f2 * h);
|
||||
*C = f2 /d;
|
||||
if (*C >= safmin) {
|
||||
*DA = fs_r / *C;
|
||||
*(DA+1) = fs_i / *C;
|
||||
} else {
|
||||
*DA = fs_r * (h / d);
|
||||
*(DA+1) = fs_i / (h / d);
|
||||
}
|
||||
*S = gs_r * (fs_r /d) - gs_i * (fs_i / d);
|
||||
*(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d);
|
||||
}
|
||||
*C *= w;
|
||||
*DA *= u;
|
||||
*(DA+1) *= u;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -33,7 +33,7 @@ endif
|
|||
ifdef TARGET_CORE
|
||||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))
|
||||
override CFLAGS += -march=sapphirerapids
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
|
@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
|||
endif
|
||||
else ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9)))
|
||||
override CFLAGS += -march=cooperlake
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
|
@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN)
|
|||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||
else ifeq ($(TARGET_CORE), LOONGSON3R4)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
|
||||
else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
|
||||
else
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
endif
|
||||
else
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
endif
|
||||
|
|
|
@ -35,6 +35,12 @@ USE_TRMM = 1
|
|||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(TARGET), MIPS64_GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT absxi = 0.0;
|
||||
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if (n <= 0 || inc_x == 0) return(0.0);
|
||||
if ( n == 1 ) return( ABS(x[0]) );
|
||||
|
||||
n *= inc_x;
|
||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG inc_x2;
|
||||
FLOAT temp;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if (n <= 0 || inc_x == 0) return(0.0);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S
|
|||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
|
@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
|
|||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
|
@ -128,10 +118,10 @@ SGEMM_BETA = sgemm_beta.S
|
|||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
|
||||
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
|
||||
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
|
||||
SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
@ -149,8 +139,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c
|
|||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||
DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
|
||||
DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
|
@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
|
|||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
||||
CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
|
@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
|||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
||||
ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
|
|
|
@ -1,189 +1 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
include $(KERNELDIR)/KERNEL.ARMV8SVE
|
||||
|
|
|
@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR w17
|
||||
#define alphaI w18
|
||||
#define alphaI w19
|
||||
|
||||
#define alpha0_R s10
|
||||
#define alphaV0_R v10.s[0]
|
||||
|
|
|
@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR w17
|
||||
#define alphaI w18
|
||||
#define alphaI w19
|
||||
|
||||
#define alpha0_R s10
|
||||
#define alphaV0_R v10.s[0]
|
||||
|
|
|
@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
add pB, pB, 32
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M1
|
||||
|
@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M2
|
||||
|
@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_E
|
||||
|
@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ii z22.s, p1/m, z3.s, z15.s
|
||||
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
|
@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ii z22.s, p1/m, z1.s, z15.s
|
||||
OP_ri z23.s, p1/m, z0.s, z15.s
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
|
@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #3
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z28.s, z29.s}, p1/z, [pCRow2]
|
||||
fmla z28.s, p1/m, z20.s, alphaz_R
|
||||
fmls z28.s, p1/m, z21.s, alphaz_I
|
||||
|
@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla z31.s, p1/m, z23.s, alphaz_R
|
||||
st2w {z30.s, z31.s}, p1, [pCRow3]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
|
@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #3
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
|
@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, s0
|
||||
dup alphaz_R, alphaR
|
||||
fmov alphaI, s1
|
||||
|
@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
bne .Lcgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_END:
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
|
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
aoffset += active * lda * 2;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
|
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
aoffset += active * 2;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
|
|
@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define pCRow3 x15
|
||||
#define pA x16
|
||||
#define alphaR w17
|
||||
#define alphaI w18
|
||||
#define temp x19
|
||||
#define tempOffset x20
|
||||
#define tempK x21
|
||||
#define alphaI w19
|
||||
#define temp x20
|
||||
#define tempOffset x21
|
||||
#define tempK x22
|
||||
|
||||
#define alpha0_R s10
|
||||
#define alphaV0_R v10.s[0]
|
||||
|
|
|
@ -1,79 +0,0 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  /*
   * Packs `a` into `b` one predicated SVE vector of doubles at a time.
   *
   * Lane k of each gather reads the element k * lda past the current source
   * position; the source position moves forward by one element for each of
   * the m inner steps, and the destination moves forward by the number of
   * active lanes. After every panel the source base advances by
   * (lanes per vector) * lda and the predicate is rebuilt from the running
   * index and n. Always returns 0.
   */
  svint64_t stride_idx = svindex_s64(0LL, lda);
  uint64_t lanes = svcntd();

  IFLOAT *panel_src = a;
  IFLOAT *dst = b;

  BLASLONG col = 0;
  svbool_t mask = svwhilelt_b64(col, n);
  uint64_t live = svcntp_b64(svptrue_b64(), mask);

  do {
    IFLOAT *src = panel_src;

    /* m gather/store steps for the current panel. */
    for (uint64_t left = m; left != 0; left--) {
      svfloat64_t v = svld1_gather_index(mask, (double *) src, stride_idx);
      svst1_f64(mask, (double *) dst, v);
      src++;
      dst += live;
    }

    panel_src += lanes * lda;

    /* Move on to the next group of lanes and recompute the predicate. */
    col += svcntd();
    mask = svwhilelt_b64(col, n);
    live = svcntp_b64(svptrue_b64(), mask);
  } while (svptest_any(svptrue_b64(), mask));

  return 0;
}
|
|
@ -1,77 +0,0 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  /*
   * Packs `a` into `b` using contiguous predicated loads of doubles.
   *
   * Each of the m inner steps loads the active lanes starting at the current
   * source position, stores them at the destination, then moves the source
   * forward by lda and the destination forward by the number of active
   * lanes. After every panel the source base advances by one vector's worth
   * of elements and the predicate is rebuilt from the running index and n.
   * Always returns 0.
   */
  uint64_t lanes = svcntd();

  IFLOAT *panel_src = a;
  IFLOAT *dst = b;

  BLASLONG col = 0;
  svbool_t mask = svwhilelt_b64(col, n);
  uint64_t live = svcntp_b64(svptrue_b64(), mask);

  do {
    IFLOAT *src = panel_src;

    /* m load/store steps for the current panel. */
    for (uint64_t left = m; left != 0; left--) {
      svfloat64_t v = svld1(mask, (double *)src);
      svst1_f64(mask, (double *) dst, v);
      src += lda;
      dst += live;
    }

    panel_src += lanes;

    /* Move on to the next group of lanes and recompute the predicate. */
    col += svcntd();
    mask = svwhilelt_b64(col, n);
    live = svcntp_b64(svptrue_b64(), mask);
  } while (svptest_any(svptrue_b64(), mask));

  return 0;
}
|
|
@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
|
|||
BLASLONG sve_width = SVE_WIDTH;
|
||||
|
||||
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
|
||||
svbool_t pg_a = SVE_WHILELT(i, n);
|
||||
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
|
||||
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
|
||||
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
|
||||
|
||||
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
|
||||
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
|
||||
|
|
|
@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <float.h>
|
||||
#include <arm_neon.h>
|
||||
|
||||
#if defined(SMP)
|
||||
|
@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
#else
|
||||
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
||||
#endif
|
||||
if (fabs(scale) <1.e-300) return 0.;
|
||||
volatile FLOAT sca = fabs(scale);
|
||||
if (sca < DBL_MIN) return 0.;
|
||||
ssq = sqrt(ssq) * scale;
|
||||
|
||||
return ssq;
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define COUNT "cntd"
|
||||
#define SV_TYPE svfloat64_t
|
||||
#define SV_INDEX svuint64_t
|
||||
#define SV_INDEXER svindex_u64
|
||||
#define SV_TRUE svptrue_b64
|
||||
#define SV_WHILE svwhilelt_b64
|
||||
#else
|
||||
#define COUNT "cntw"
|
||||
#define SV_TYPE svfloat32_t
|
||||
#define SV_INDEX svuint32_t
|
||||
#define SV_INDEXER svindex_u32
|
||||
#define SV_TRUE svptrue_b32
|
||||
#define SV_WHILE svwhilelt_b32
|
||||
#endif
|
||||
|
||||
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
|
||||
a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \
|
||||
a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \
|
||||
svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \
|
||||
a_offset_inner += 2; \
|
||||
b_offset += active * 2;
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
|
||||
uint64_t sve_size;
|
||||
asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
|
||||
|
||||
IFLOAT *a_offset, *a_offset_inner, *b_offset;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
|
||||
SV_TYPE a_vec_real;
|
||||
SV_TYPE a_vec_imag;
|
||||
svbool_t pg_true = SV_TRUE();
|
||||
|
||||
BLASLONG single_vectors_n = n & -sve_size;
|
||||
for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
|
||||
a_offset_inner = a_offset;
|
||||
|
||||
svbool_t pg = pg_true;
|
||||
uint64_t active = sve_size;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
a_offset += sve_size * lda * 2;
|
||||
}
|
||||
|
||||
BLASLONG remaining_n = n - single_vectors_n;
|
||||
if (remaining_n) {
|
||||
a_offset_inner = a_offset;
|
||||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
|
||||
uint64_t active = remaining_n;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define COUNT "cntd"
|
||||
#define SV_TYPE svfloat64_t
|
||||
#define SV_INDEX svuint64_t
|
||||
#define SV_INDEXER svindex_u64
|
||||
#define SV_TRUE svptrue_b64
|
||||
#define SV_WHILE svwhilelt_b64
|
||||
#define SV_PREFETCH svprfd_gather_index
|
||||
#else
|
||||
#define COUNT "cntw"
|
||||
#define SV_TYPE svfloat32_t
|
||||
#define SV_INDEX svuint32_t
|
||||
#define SV_INDEXER svindex_u32
|
||||
#define SV_TRUE svptrue_b32
|
||||
#define SV_WHILE svwhilelt_b32
|
||||
#define SV_PREFETCH svprfw_gather_index
|
||||
#endif
|
||||
|
||||
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
|
||||
a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \
|
||||
svst1(pg, b_offset, a_vec); \
|
||||
a_offset_inner++; \
|
||||
b_offset += active;
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
|
||||
uint64_t sve_size;
|
||||
asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
|
||||
|
||||
IFLOAT *a_offset, *a_offset_inner, *b_offset;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
SV_INDEX lda_vec = SV_INDEXER(0LL, lda);
|
||||
SV_TYPE a_vec;
|
||||
svbool_t pg_true = SV_TRUE();
|
||||
|
||||
BLASLONG single_vectors_n = n & -sve_size;
|
||||
for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
|
||||
a_offset_inner = a_offset;
|
||||
|
||||
svbool_t pg = pg_true;
|
||||
uint64_t active = sve_size;
|
||||
uint64_t i_cnt = m >> 3;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 4) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
a_offset += sve_size * lda;
|
||||
}
|
||||
|
||||
BLASLONG remaining_n = n - single_vectors_n;
|
||||
if (remaining_n) {
|
||||
a_offset_inner = a_offset;
|
||||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
|
||||
uint64_t active = remaining_n;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,115 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define COUNT "cntd"
|
||||
#define SV_TYPE svfloat64x2_t
|
||||
#define SV_TRUE svptrue_b64
|
||||
#define SV_WHILE svwhilelt_b64
|
||||
#else
|
||||
#define COUNT "cntw"
|
||||
#define SV_TYPE svfloat32x2_t
|
||||
#define SV_TRUE svptrue_b32
|
||||
#define SV_WHILE svwhilelt_b32
|
||||
#endif
|
||||
|
||||
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
|
||||
a_vec = svld2(pg, a_offset_inner); \
|
||||
svst2(pg, b_offset, a_vec); \
|
||||
a_offset_inner += lda * 2; \
|
||||
b_offset += active * 2;
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
uint64_t sve_size = svcntw();
|
||||
asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
|
||||
|
||||
IFLOAT *a_offset, *a_offset_inner, *b_offset;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
SV_TYPE a_vec;
|
||||
svbool_t pg_true = SV_TRUE();
|
||||
|
||||
BLASLONG single_vectors_n = n & -sve_size;
|
||||
for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
|
||||
a_offset_inner = a_offset;
|
||||
|
||||
svbool_t pg = pg_true;
|
||||
uint64_t active = sve_size;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
a_offset += sve_size * 2;
|
||||
}
|
||||
|
||||
BLASLONG remaining_n = n - single_vectors_n;
|
||||
if (remaining_n) {
|
||||
a_offset_inner = a_offset;
|
||||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
|
||||
uint64_t active = remaining_n;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define COUNT "cntd"
|
||||
#define SV_TYPE svfloat64_t
|
||||
#define SV_TRUE svptrue_b64
|
||||
#define SV_WHILE svwhilelt_b64
|
||||
#else
|
||||
#define COUNT "cntw"
|
||||
#define SV_TYPE svfloat32_t
|
||||
#define SV_TRUE svptrue_b32
|
||||
#define SV_WHILE svwhilelt_b32
|
||||
#endif
|
||||
|
||||
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
|
||||
a_vec = svld1(pg, a_offset_inner); \
|
||||
svst1(pg, b_offset, a_vec); \
|
||||
a_offset_inner += lda; \
|
||||
b_offset += active;
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||
uint64_t sve_size = svcntw();
|
||||
asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );
|
||||
|
||||
IFLOAT *a_offset, *a_offset_inner, *b_offset;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
SV_TYPE a_vec;
|
||||
svbool_t pg_true = SV_TRUE();
|
||||
|
||||
BLASLONG single_vectors_n = n & -sve_size;
|
||||
for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
|
||||
a_offset_inner = a_offset;
|
||||
|
||||
svbool_t pg = pg_true;
|
||||
uint64_t active = sve_size;
|
||||
uint64_t i_cnt = m >> 3;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 4) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
a_offset += sve_size;
|
||||
}
|
||||
|
||||
BLASLONG remaining_n = n - single_vectors_n;
|
||||
if (remaining_n) {
|
||||
a_offset_inner = a_offset;
|
||||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
|
||||
uint64_t active = remaining_n;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 2) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -1,78 +0,0 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
  /*
   * Packs `a` into `b` one predicated SVE vector of floats at a time.
   *
   * Lane k of each gather reads the element k * lda past the current source
   * position (indices are held in a 32-bit index vector); the source
   * position moves forward by one element for each inner step, and the
   * destination moves forward by the number of active lanes. The inner step
   * counter is a 32-bit value initialised from m. After every panel the
   * source base advances by (lanes per vector) * lda and the predicate is
   * rebuilt from the running index and n. Always returns 0.
   */
  svint32_t stride_idx = svindex_s32(0LL, lda);
  uint32_t lanes = svcntw();

  IFLOAT *panel_src = a;
  IFLOAT *dst = b;

  BLASLONG col = 0;
  svbool_t mask = svwhilelt_b32(col, n);
  uint32_t live = svcntp_b32(svptrue_b32(), mask);

  do {
    IFLOAT *src = panel_src;

    /* Gather/store steps for the current panel. */
    for (uint32_t left = m; left != 0; left--) {
      svfloat32_t v = svld1_gather_index(mask, (float *) src, stride_idx);
      svst1_f32(mask, (float *) dst, v);
      src++;
      dst += live;
    }

    panel_src += lanes * lda;

    /* Move on to the next group of lanes and recompute the predicate. */
    col += svcntw();
    mask = svwhilelt_b32(col, n);
    live = svcntp_b32(svptrue_b32(), mask);
  } while (svptest_any(svptrue_b32(), mask));

  return 0;
}
|
|
@ -1,77 +0,0 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#include <arm_sve.h>
|
||||
|
||||
// TODO: write in assembly with proper unrolling of inner loop
|
||||
/*
 * Pack an m-by-n block of 32-bit elements from `a` into the contiguous
 * buffer `b`, processing up to svcntw() values of the n-index per pass.
 *
 * For each pass starting at index j, and for every i in [0, m), the active
 * lanes are loaded contiguously from a[j + i * lda] onward and stored
 * back-to-back at the current write position in `b`, which advances by the
 * number of active lanes.
 *
 *   m, n  extents of the block; nothing is written if either is <= 0
 *   a     source data, accessed as float
 *   lda   element stride between consecutive loads (one per i)
 *   b     destination buffer, receives m * n elements
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

    /* Nothing to pack. This also guarantees the unsigned predicate
     * arithmetic below never sees a negative extent. */
    if (m <= 0 || n <= 0) return 0;

    const BLASLONG vl = (BLASLONG) svcntw();

    IFLOAT *boffset = b;

    for (BLASLONG j = 0; j < n; j += vl) {

        /* Explicit casts pick the 64-bit unsigned overload no matter how
         * BLASLONG is typedef'd; j and n are non-negative at this point. */
        svbool_t pg = svwhilelt_b32((uint64_t) j, (uint64_t) n);
        uint64_t active = svcntp_b32(svptrue_b32(), pg);

        /* Base of this pass: computed from j rather than accumulated, so no
         * pointer is formed beyond the last block that is actually read. */
        IFLOAT *aoffset1 = a + j;

        /* Full-width counter: m is not truncated to 32 bits. */
        for (BLASLONG i = 0; i < m; i++) {
            svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
            svst1_f32(pg, (float *) boffset, a_vec);
            aoffset1 += lda;
            boffset += active;
        }
    }

    return 0;
}
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
|
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
|
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue