Merge pull request #4210 from xianyi/develop

merge develop into 0.3.0 for 0.3.24
This commit is contained in:
Martin Kroeker 2023-09-03 22:55:22 +02:00 committed by GitHub
commit 2c68822cde
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
845 changed files with 48441 additions and 6334 deletions

167
.cirrus.yml Normal file
View File

@ -0,0 +1,167 @@
# CI instance definition: macOS Monterey image with Xcode from the cirruslabs registry.
macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
# Make-based build with Homebrew LLVM clang and OpenMP, using TARGET=VORTEX.
task:
name: AppleM1/LLVM
compile_script:
- brew install llvm
# Put the Homebrew LLVM binaries, libraries and headers in front of the defaults.
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang
# Same LLVM/OpenMP make build as AppleM1/LLVM, but with INTERFACE64=1
# (the ILP64 variant, per the task name).
task:
name: AppleM1/LLVM/ILP64
compile_script:
- brew install llvm
# Put the Homebrew LLVM binaries, libraries and headers in front of the defaults.
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
# CMake build with Homebrew LLVM clang, producing shared libraries (BUILD_SHARED_LIBS=ON).
task:
name: AppleM1/LLVM/CMAKE
compile_script:
- brew install llvm
# Put the Homebrew LLVM binaries, libraries and headers in front of the defaults.
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
# Out-of-source build in ./build.
- mkdir build
- cd build
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
- make
# Make-based OpenMP build with Homebrew GCC 11 (gcc-11 / gfortran-11).
# No TARGET is passed here, unlike the LLVM tasks.
task:
name: AppleM1/GCC/MAKE/OPENMP
compile_script:
- brew install gcc@11
# Use the generic Homebrew prefix for binaries, libraries and headers.
- export PATH=/opt/homebrew/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/lib"
- export CPPFLAGS="-I/opt/homebrew/include"
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
# Cross-build for x86_64 with the Xcode 14.0.0 clang and the MacOSX12.3 SDK
# (DYNAMIC_ARCH=1, no Fortran).
macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
task:
name: AppleM1/LLVM x86_64 xbuild
compile_script:
# NOTE(review): in YAML everything after " #" is a comment, so the next entry is
# an empty list item and the three after it each run a bare `export` (which
# prints the environment). They look like disabled Homebrew LLVM setup lines.
- #brew install llvm
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
# Xcode-style architecture variables, all restricted to i386/x86_64.
- export ARCHS="i386 x86_64"
- export ARCHS_STANDARD="i386 x86_64"
- export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
- export ARCHS_STANDARD_64_BIT=x86_64
- export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
- export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
- export VALID_ARCHS="i386 x86_64"
# Log the active SDK path and Xcode version for diagnostics.
- xcrun --sdk macosx --show-sdk-path
- xcodebuild -version
# Target compiler and flags; the paths are hard-coded to Xcode 14.0.0 / MacOSX12.3.sdk.
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
# RANLIB is replaced by `ls -l`, so no real ranlib runs on the archive
# (presumably to avoid a host/target tool mismatch -- confirm).
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
# Publish files matching *conf* as text artifacts.
always:
config_artifacts:
path: "*conf*"
type: text/plain
# Disabled: upload of the built library as a binary artifact.
# lib_artifacts:
# path: "libopenblas*"
# type: application/octet-stream
# Cross-build for arm64 iOS with the Xcode 14.0.0 clang and the iPhoneOS16.0 SDK
# (TARGET=ARMV8, CROSS=1, no Fortran).
macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
task:
name: AppleM1/LLVM armv8-ios xbuild
compile_script:
# NOTE(review): text after " #" is a YAML comment, so the first entry is empty
# and the next three each run a bare `export`.
- #brew install llvm
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
# Target compiler and flags; the paths are hard-coded to Xcode 14.0.0 / iPhoneOS16.0.sdk.
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
# Publish files matching *conf* as text artifacts.
always:
config_artifacts:
path: "*conf*"
type: text/plain
# Cross-build for ARMv7 Android with the clang from the Homebrew android-ndk
# cask, release 25b (TARGET=ARMV7, soft-float ABI, no Fortran).
macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
task:
name: AppleM1/LLVM armv7-androidndk xbuild
compile_script:
# NOTE(review): text after " #" is a YAML comment, so the first entry is empty
# and the next three each run a bare `export`. The NDK is not installed here,
# so it is presumably preinstalled on the image -- confirm.
- #brew install android-ndk
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
# Diagnostic: list any armv7a ranlib binaries shipped in the NDK.
- find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
# The next two entries are fully commented out (empty items); they are leftovers
# from the iOS task.
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
# NDK clang wrapper for armv7a, Android API level 23; the path is hard-coded to one NDK build.
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
# RANLIB is replaced by `ls -l`, so no real ranlib runs on the archive.
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
# Publish files matching *conf* as text artifacts.
always:
config_artifacts:
path: "*conf*"
type: text/plain
# Default `make` build inside an ARM container based on the node:latest image.
task:
name: NeoverseN1
arm_container:
image: node:latest
compile_script:
- make
# ARM container build with INTERFACE64=1 (the ILP64 variant, per the task name).
task:
name: NeoverseN1-ILP64
arm_container:
image: node:latest
compile_script:
- make INTERFACE64=1
# ARM container build with OpenMP enabled; the container requests 8 CPUs.
task:
name: NeoverseN1-OMP
arm_container:
image: node:latest
cpu: 8
compile_script:
- make USE_OPENMP=1
# FreeBSD 13.2 build with GNU make and the gcc package from pkg.
FreeBSD_task:
name: FreeBSD-gcc12
freebsd_instance:
image_family: freebsd-13-2
# Refresh the package catalogue, upgrade installed packages, then add gmake and gcc.
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
compile_script:
# Diagnostic listing of the libraries installed under /usr/local/lib.
- ls -l /usr/local/lib
- gmake CC=gcc
# FreeBSD 13.2 build with gmake/gcc and INTERFACE64=1.
# NOTE(review): the task name is lower-case while the sibling task is "FreeBSD-gcc12".
FreeBSD_task:
name: freebsd-gcc12-ilp64
freebsd_instance:
image_family: freebsd-13-2
# Refresh the package catalogue, upgrade installed packages, then add gmake and gcc.
install_script:
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
compile_script:
# Diagnostic listing of the libraries installed under /usr/local/lib.
- ls -l /usr/local/lib
- gmake CC=gcc INTERFACE64=1
#task:
# name: Windows/LLVM16 --- too slow ---
# windows_container:
# image: cirrusci/windowsservercore:cmake-2021.12.07
# install_script:
# - choco list --localonly
# - choco install -y llvm
# - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
# - choco install -y ninja
# - refreshenv
# - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
# - vcvarsall x64
# - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
# - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
# - cd build
# - cmake --build .
# - ctest

121
.github/workflows/c910v.yml vendored Normal file
View File

@ -0,0 +1,121 @@
# Cross-compiles OpenBLAS for two RISC-V targets and runs the utest, ctest and
# test binaries under a qemu-user built from the T-head-Semi/qemu fork.
name: c910v qemu test

on: [push, pull_request]

permissions:
  contents: read # to fetch code (actions/checkout)

jobs:
  TEST:
    runs-on: ubuntu-latest
    env:
      # Download location and archive name of the vendor toolchain.
      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
    strategy:
      fail-fast: false
      matrix:
        include:
          - target: RISCV64_GENERIC
            triple: riscv64-linux-gnu
            apt_triple: riscv64-linux-gnu
            opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
          - target: C910V
            triple: riscv64-unknown-linux-gnu
            apt_triple: riscv64-linux-gnu
            opts: NO_SHARED=1 TARGET=C910V

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: install build deps
        # -y keeps the install non-interactive regardless of the runner's apt defaults.
        run: |
          sudo apt-get update
          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross

      - name: checkout qemu
        uses: actions/checkout@v3
        with:
          repository: T-head-Semi/qemu
          path: qemu
          # Quoted so the hex commit id is always read as a string.
          ref: '1e692ebb43d396c52352406323fc782c1ac99a42'

      - name: build qemu
        run: |
          # Force use c910v qemu-user
          wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
          cd qemu
          patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
          make -j$(nproc)
          make install

      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.target }}

      - name: Configure ccache
        run: |
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      - name: build OpenBLAS
        run: |
          wget ${xuetie_toolchain}/${toolchain_file_name}
          tar -xvf ${toolchain_file_name} -C /opt
          export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
          make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      - name: test
        # The loops run the same commands in the same order as listing them one
        # by one: p is the precision prefix (s, d, c, z), level the BLAS level.
        run: |
          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
          qemu-riscv64 ./utest/openblas_utest
          # CBLAS tests: level 1 takes no input, levels 2 and 3 read ./ctest/<p>in<level>.
          for p in s d c z; do
            OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/x${p}cblat1
          done
          for level in 2 3; do
            for p in s d c z; do
              OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/x${p}cblat${level} < ./ctest/${p}in${level}
            done
          done
          # Fortran BLAS tests, level 1: single-threaded first, then two threads.
          for p in s d c z; do
            OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/${p}blat1
          done
          for p in s d c z; do
            OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/${p}blat1
          done
          # Levels 2 and 3: remove stale summary files before each pass.
          for level in 2 3; do
            rm -f ./test/?BLAT${level}.SUMM
            for p in s d c z; do
              OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/${p}blat${level} < ./test/${p}blat${level}.dat
            done
            rm -f ./test/?BLAT${level}.SUMM
            for p in s d c z; do
              OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/${p}blat${level} < ./test/${p}blat${level}.dat
            done
          done

View File

@ -151,40 +151,53 @@ jobs:
strategy:
fail-fast: false
matrix:
msystem: [MINGW64, MINGW32, CLANG64]
msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
idx: [int32, int64]
build-type: [Release]
include:
- msystem: MINGW64
idx: int32
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
fc-pkg: fc
- msystem: MINGW32
idx: int32
target-prefix: mingw-w64-i686
fc-pkg: mingw-w64-i686-gcc-fortran
fc-pkg: fc
- msystem: CLANG64
idx: int32
target-prefix: mingw-w64-clang-x86_64
fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: CLANG32
idx: int32
target-prefix: mingw-w64-clang-i686
fc-pkg: cc
c-lapack-flags: -DC_LAPACK=ON
- msystem: MINGW64
idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
fc-pkg: fc
- msystem: CLANG64
idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-clang-x86_64
c-lapack-flags: -DC_LAPACK=ON
fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: MINGW64
idx: int32
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
fc-pkg: fc
build-type: None
exclude:
- msystem: MINGW32
idx: int64
- msystem: CLANG32
idx: int64
defaults:
run:
@ -209,7 +222,7 @@ jobs:
install: >-
base-devel
${{ matrix.target-prefix }}-cc
${{ matrix.fc-pkg }}
${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
${{ matrix.target-prefix }}-cmake
${{ matrix.target-prefix }}-ninja
${{ matrix.target-prefix }}-ccache
@ -217,14 +230,21 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v3
- name: Compilation cache
uses: actions/cache@v3
with:
# It looks like this path needs to be hard-coded.
path: C:/msys64/home/runneradmin/.ccache
- name: Prepare ccache
# Get cache location of ccache
# Create key that is used in action/cache/restore and action/cache/save steps
id: ccache-prepare
run: |
echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
- name: Restore ccache
uses: actions/cache/restore@v3
with:
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
key: ${{ steps.ccache-prepare.outputs.key }}
# Restore a matching ccache cache entry. Prefer same branch.
restore-keys: |
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
@ -234,9 +254,10 @@ jobs:
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
run: |
which ccache
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 250M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
ccache -p
ccache -s
echo $HOME
cygpath -w $HOME
@ -253,6 +274,7 @@ jobs:
-DTARGET=CORE2 \
${{ matrix.idx64-flags }} \
${{ matrix.c-lapack-flags }} \
${{ matrix.no-avx512-flags }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
@ -264,10 +286,30 @@ jobs:
continue-on-error: true
run: ccache -s
- name: Save ccache
# Save the cache after we are done (successfully) building
uses: actions/cache/save@v3
with:
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
key: ${{ steps.ccache-prepare.outputs.key }}
- name: Run tests
id: run-ctest
timeout-minutes: 60
run: cd build && ctest
- name: Re-run tests
if: always() && (steps.run-ctest.outcome == 'failure')
timeout-minutes: 60
run: |
cd build
echo "::group::Re-run ctest"
ctest --rerun-failed --output-on-failure || true
echo "::endgroup::"
echo "::group::Log from these tests"
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
echo "::endgroup::"
cross_build:
runs-on: ubuntu-22.04
@ -295,6 +337,7 @@ jobs:
- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
- name: Compilation cache

110
.github/workflows/loongarch64.yml vendored Normal file
View File

@ -0,0 +1,110 @@
# Cross-compiles OpenBLAS for three LoongArch64 targets and runs the utest,
# ctest and test binaries under qemu-loongarch64-static.
name: loongarch64 qemu test

on: [push, pull_request]

# Least-privilege token: the job only needs to read the repository.
permissions:
  contents: read # to fetch code (actions/checkout)

jobs:
  TEST:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - target: LOONGSONGENERIC
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
          - target: LOONGSON3R5
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSON3R5
          - target: LOONGSON2K1000
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSON2K1000

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Install APT deps
        # -y keeps both commands non-interactive; qemu-user-static comes from the added PPA.
        run: |
          sudo add-apt-repository -y ppa:savoury1/virtualisation
          sudo apt-get update
          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
          qemu-user-static

      - name: Download and install loongarch64-toolchain
        run: |
          wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
          tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt

      - name: Set env
        # Written to GITHUB_ENV so that later steps see the cross-toolchain paths.
        run: |
          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV

      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.target }}

      - name: Configure ccache
        run: |
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      - name: Disable utest dsdot:dsdot_n_1
        # Truncates the test source so the dsdot utest is not built.
        run: |
          echo -n > utest/test_dsdot.c
          echo "Due to the qemu versions 7.2 causing utest cases to fail,"
          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."

      - name: Build OpenBLAS
        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      - name: Test
        # The loops run the same commands in the same order as listing them one
        # by one: p is the precision prefix (s, d, c, z), level the BLAS level.
        run: |
          qemu-loongarch64-static ./utest/openblas_utest
          # CBLAS tests: level 1 takes no input, levels 2 and 3 read ./ctest/<p>in<level>.
          for p in s d c z; do
            OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/x${p}cblat1
          done
          for level in 2 3; do
            for p in s d c z; do
              OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/x${p}cblat${level} < ./ctest/${p}in${level}
            done
          done
          # Fortran BLAS tests, level 1: single-threaded first, then two threads.
          for p in s d c z; do
            OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/${p}blat1
          done
          for p in s d c z; do
            OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/${p}blat1
          done
          # Levels 2 and 3: remove stale summary files before each pass.
          for level in 2 3; do
            rm -f ./test/?BLAT${level}.SUMM
            for p in s d c z; do
              OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/${p}blat${level} < ./test/${p}blat${level}.dat
            done
            rm -f ./test/?BLAT${level}.SUMM
            for p in s d c z; do
              OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/${p}blat${level} < ./test/${p}blat${level}.dat
            done
          done

3
.gitignore vendored
View File

@ -14,6 +14,7 @@ lapack-3.4.2
lapack-3.4.2.tgz
lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/SRC/la_constants.mod
lapack-netlib/TESTING/testing_results.txt
lapack-netlib/INSTALL/test*
lapack-netlib/TESTING/xeigtstc
@ -71,6 +72,7 @@ test/SBLAT3.SUMM
test/ZBLAT2.SUMM
test/ZBLAT3.SUMM
test/SHBLAT3.SUMM
test/SBBLAT3.SUMM
test/cblat1
test/cblat2
test/cblat3
@ -81,6 +83,7 @@ test/sblat1
test/sblat2
test/sblat3
test/test_shgemm
test/test_sbgemm
test/zblat1
test/zblat2
test/zblat3

View File

@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 23)
set(OpenBLAS_PATCH_VERSION 23.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@ -20,6 +20,8 @@ include(CMakePackageConfigHelpers)
#######
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
@ -309,19 +311,25 @@ endif()
#if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
if (NOT ONLY_CBLAS)
# Broken without fortran on unix
add_subdirectory(utest)
add_subdirectory(utest)
endif()
endif()
if (NOT NOFORTRAN)
if (NOT ONLY_CBLAS)
# Build test and ctest
add_subdirectory(test)
endif()
if (BUILD_TESTING)
add_subdirectory(lapack-netlib/TESTING)
endif()
endif()
if(NOT NO_CBLAS)
if (NOT ONLY_CBLAS)
add_subdirectory(ctest)
endif()
endif()
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
add_subdirectory(cpp_thread_test)
@ -398,15 +406,45 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
if (${BUILD_LAPACK_DEPRECATED})
set (BLD 1)
else ()
set (BLD 0)
endif()
if (${BUILD_BFLOAT16})
set (BBF16 1)
else ()
set (BBF16 0)
endif()
if (${BUILD_SINGLE})
set (BS 1)
else ()
set (BS 0)
endif()
if (${BUILD_DOUBLE})
set (BD 1)
else ()
set (BD 0)
endif()
if (${BUILD_COMPLEX})
set (BC 1)
else ()
set (BC 0)
endif()
if (${BUILD_COMPLEX16})
set (BZ 1)
else ()
set (BZ 0)
endif()
if (NOT USE_PERL)
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
else()
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
@ -511,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(PN OpenBLAS)
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
configure_package_config_file(cmake/${PN}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})

View File

@ -23,6 +23,9 @@
* Optimization on AMD Piledriver
* Optimization on Intel Haswell
* Chris Sidebottom <chris.sidebottom@arm.com>
* Optimizations and other improvements targeting AArch64
## Previous Developers
* Zaheer Chothia <zaheer.chothia@gmail.com>

View File

@ -1,4 +1,104 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.24
03-Sep-2023
general:
- declared the arguments of cblas_xerbla as const (in accordance with the reference implementation
and others; the previous discrepancy appears to date back to GotoBLAS)
- fixed the implementation of ?GEMMT that was added in 0.3.23
- made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds
- fixed application of SYMBOLSUFFIX in CMAKE builds
- fixed missing SSYCONVF function in the shared library
- fixed parallel build logic used with gmake
- added support for compilation with LLVM17, in particular its new Fortran compiler
- added support for CMAKE builds using the NVIDIA HPC compiler
- fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler
- fixed cross-build detection and management in c_check
- disabled building of the tests with CMAKE when ONLY_CBLAS is defined
- fixed several issues with the handling of runtime limits on the number of OPENMP threads
- corrected the error code returned by SGEADD/DGEADD when LDA is too small
- corrected the error code returned by IMATCOPY when LDB is too small
- updated ?NRM2 to support negative increment values (as introduced in release 3.10
of the reference BLAS)
- fixed OpenMP builds with CLANG for the case where libomp is not in a standard location
- fixed a potential overwrite of unrelated memory during thread initialisation on startup
- fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK
- fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22
- fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE
- applied additions and corrections from the development branch of Reference-LAPACK:
- fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885)
- fixed workspace query results in LAPACK ?SYTRF/?TREVC3 (from Reference-LAPACK PR 883)
- fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878)
- fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876)
- added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839)
- corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867)
- removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860)
- updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852)
- fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855)
- fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849)
- added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736)
- fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854)
- applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847)
- removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832)
- fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836)
- added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837)
- updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831)
- improved algorithm description in ?GELSY (Reference-LAPACK PR 833)
- fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830)
- fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768)
- added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827)
- added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795)
- fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820)
- adopted refactored ?GEBAL implementation (Reference-LAPACK PR 808)
x86_64:
- added cpu model autodetection for Intel Alder Lake N
- added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel
- worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer
- fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG
- fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH
- fixed feature-based cputype fallback in DYNAMIC_ARCH
- added support for building the AVX512 kernels with the NVIDIA HPC compiler
- corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case
- fixed a potential use of uninitialized variables in ZTRSM
ARM64:
- added cpu model autodetection for Apple M2
- fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register)
- added support for building the SVE kernels with the NVIDIA HPC compiler
- added support for building the SVE kernels with the Apple Clang compiler
- fixed compiler option handling for building the SVE kernels with LLVM
- implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse
- activated SVE SGEMM and DGEMM kernels for Neoverse V1
- improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1
- improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH
- fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or
container restrictions into account
- fixed a potential use of uninitialized variables in ZTRSM
- fixed a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds
LOONGARCH64:
- added ABI detection
- added support for cpu affinity handling
- fixed compilation with early versions of the Loongson toolchain
- added an optimized SGEMM kernel for 3A5000
- added optimized DGEMV kernels for 3A5000
- improved the performance of the DGEMM kernel for 3A5000
MIPS64:
- fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target
POWER:
- fixed compiler warnings in the POWER10 SBGEMM kernel
RISCV:
- fixed application of the INTERFACE64 option when building with CMAKE
- fixed a potential misdetection of RISCV hardware as 32bit in CMAKE builds
- fixed IDAMAX and DOT kernels for C910V
- fixed corner cases in the ROT and SWAP kernels for C910V
- fixed compilation of the C910V target with recent vendor compilers
====================================================================
Version 0.3.23
01-Apr-2023

15
Jenkinsfile vendored
View File

@ -1,9 +1,14 @@
node {
stage('Checkout') {
checkout
pipeline {
agent {
docker {
image 'osuosl/ubuntu-s390x'
}
}
stages {
stage('Build') {
sh("make")
steps {
sh 'make clean && make'
}
}
}
}

16
Jenkinsfile.pwr Normal file
View File

@ -0,0 +1,16 @@
// Jenkins declarative pipeline with a single 'Build' stage that runs inside
// the 'osuosl/ubuntu-ppc64le' Docker image.
pipeline {
agent {
docker {
image 'osuosl/ubuntu-ppc64le'
}
}
stages {
stage('Build') {
steps {
// Refresh the package index and install gfortran before building.
sh 'sudo apt update'
sh 'sudo apt install gfortran -y'
// Run `make clean` first, then `make`; `&&` stops if the clean step fails.
sh 'make clean && make'
}
}
}
}

View File

@ -40,9 +40,9 @@ LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
.NOTPARALLEL : shared
all :: libs netlib $(RELA) tests shared
all :: tests
@echo
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
@echo
@ -150,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT)
endif
endif
tests : libs netlib $(RELA) shared
tests : shared
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
@ -373,10 +373,10 @@ ifneq ($(CROSS), 1)
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
endif
lapack-runtest:
lapack-runtest: lapack-test
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
blas-test:

View File

@ -69,7 +69,7 @@ endif
# in GCC>=9
ifeq ($(CORE), NEOVERSEN1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
@ -92,9 +92,14 @@ endif
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEV1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.4-a+sve
ifeq (1, $(ISCLANG))
CCOMMON_OPT += -mtune=cortex-x1
else
CCOMMON_OPT += -mtune=neoverse-v1
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif
@ -122,8 +127,8 @@ endif
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
else
@ -155,7 +160,7 @@ endif
# Use a53 tunings because a55 is only available in GCC>=8.1
ifeq ($(CORE), CORTEXA55)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ8), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
@ -196,8 +201,13 @@ endif
endif
ifeq ($(CORE), THUNDERX3T110)
ifeq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.3-a
ifeq (0, $(ISCLANG))
CCOMMON_OPT += -mtune=thunderx3t110
else
CCOMMON_OPT += -mtune=thunderx2t99
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
endif
@ -225,9 +235,12 @@ endif
endif
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
ifeq ($(CORE), EMAG8180)
CCOMMON_OPT += -march=armv8-a -mtune=emag
CCOMMON_OPT += -march=armv8-a
ifeq ($(ISCLANG), 0)
CCOMMON_OPT += -mtune=emag
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=emag
endif

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.23
VERSION = 0.3.23.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif
# Clang major-version checks. Each variable receives the output of `expr`
# comparing the first dot-separated field of `$(CC) -dumpversion` against the
# threshold: 1 when the major version is greater or equal, 0 otherwise.
# They are only defined when C_COMPILER is CLANG.
ifeq ($(C_COMPILER), CLANG)
CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
endif
#
# OS dependent settings
#
@ -645,7 +650,7 @@ DYNAMIC_CORE += HASWELL ZEN
endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
endif
endif
endif
@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1
ifneq ($(NO_SVE), 1)
DYNAMIC_CORE += NEOVERSEV1
DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += ARMV8SVE
endif
DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR
@ -932,8 +938,12 @@ BINARY_DEFINED = 1
endif
ifeq ($(ARCH), loongarch64)
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
ifneq ($(LA64_ABI), lp64d)
LA64_ABI=lp64
endif
CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
endif
endif
@ -1082,8 +1092,9 @@ endif
endif
endif
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
CCOMMON_OPT += -DF_INTERFACE_GFORT
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
@ -1097,6 +1108,7 @@ EXTRALIB += -lgfortran
endif
endif
endif
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
@ -1763,6 +1775,8 @@ export TARGET_CORE
export NO_AVX512
export NO_AVX2
export BUILD_BFLOAT16
export NO_LSX
export NO_LASX
export SBGEMM_UNROLL_M
export SBGEMM_UNROLL_N

View File

@ -75,18 +75,31 @@ endif
ifeq ($(CORE), COOPERLAKE)
ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC)
# cooperlake support was added in 10.1
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
CCOMMON_OPT += -march=cooperlake
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake
endif
else # gcc not support, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
# cooperlake support was added in 10.1
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
CCOMMON_OPT += -march=cooperlake
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake
endif
else # gcc not support, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
else ifeq ($(C_COMPILER), CLANG)
# cooperlake support was added in clang 9
ifeq ($(CLANGVERSIONGTEQ9), 1)
CCOMMON_OPT += -march=cooperlake
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake
endif
else # not supported in clang, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
endif
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
@ -104,18 +117,31 @@ endif
ifeq ($(CORE), SAPPHIRERAPIDS)
ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC)
# sapphire rapids support was added in 11
ifeq ($(GCCVERSIONGTEQ11), 1)
CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=sapphirerapids
endif
else # gcc not support, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
# sapphire rapids support was added in 11
ifeq ($(GCCVERSIONGTEQ11), 1)
CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=sapphirerapids
endif
else # gcc not support, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
else ifeq ($(C_COMPILER), CLANG)
# sapphire rapids support was added in clang 12
# (the Fortran flag mirrors the C flag, as in the GCC branch of this CORE)
ifeq ($(CLANGVERSIONGTEQ12), 1)
CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=sapphirerapids
endif
else # not supported in clang, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
endif
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables

View File

@ -6,11 +6,15 @@ Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
<!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
## Introduction
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.

View File

@ -115,7 +115,7 @@ jobs:
mkdir build
cd build
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake --build . --config Release
ctest
@ -271,6 +271,19 @@ jobs:
- script: |
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
# Builds TARGET=ARMV8 with DYNAMIC_ARCH=1 using the Xcode 12.5.1 clang and
# `-arch arm64` against the MacOSX11.3 SDK. The first three script lines are
# diagnostics only: available SDKs, CPUs supported for arm64, and the version
# of the same clang that CC points at.
- job: OSX_xbuild_DYNAMIC_ARM64
  pool:
    vmImage: 'macOS-11'
  variables:
    CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
    CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
  steps:
  - script: |
      ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
      /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
      /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
      make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: ALPINE_MUSL
pool:
vmImage: 'ubuntu-latest'

0
benchmark/spr.c Executable file → Normal file
View File

0
benchmark/spr2.c Executable file → Normal file
View File

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
Copyright (c) 2014, 2023 The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
int step = 1;
int loops = 1;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
double time1,timeg;
@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){

60
c_check
View File

@ -31,13 +31,17 @@ flags="$*"
cross_suffix=""
if [ "`dirname \"$compiler_name\"`" != '.' ]; then
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
if [ "`dirname "$compiler_name"`" != '.' ]; then
cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
fi
bn=`basename $compiler_name`
cn=`echo $compiler_name | sed -e 's/ -.*//'`
bn=`basename "$cn"`
case "$bn" in
*-*) cross_suffix="$cross_suffix${bn%-*}-"
*-*) if [ "$bn" != '-' ]; then
cross_suffix="$cross_suffix${bn%-*}-"
fi
esac
compiler=""
@ -164,7 +168,7 @@ fi
no_msa=0
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
tmpd="$(mktemp -d)"
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
code='"addvi.b $w0, $w1, 1"'
msa_flags='-mmsa -mfp64 -mload-store-pairs'
@ -181,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
rm -rf "$tmpd"
fi
# Probe whether the compiler can build LoongArch LSX and LASX code.
# no_lsx / no_lasx stay 0 unless the corresponding test compile fails, and
# the probe only runs when the detected architecture is loongarch64.
no_lsx=0
no_lasx=0
if [ "$architecture" = "loongarch64" ]; then
    # Same portable temp-dir creation as the other probes in this script:
    # retry with an explicit template when a bare `mktemp -d` is rejected.
    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')

    # LSX: one vector instruction wrapped in inline asm.
    tmplsx="$tmpd/lsx.c"
    codelsx='"vadd.b $vr0, $vr0, $vr0"'
    lsx_flags='-march=loongarch64 -mlsx'
    printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
    args="$lsx_flags -o $tmplsx.o $tmplsx"
    # $compiler_name, $flags and $args are left unquoted on purpose so that
    # they are split into separate command-line words.
    {
        $compiler_name $flags $args >/dev/null 2>&1
    } || {
        no_lsx=1
    }

    # LASX: same check with the LASX header, instruction and flag.
    tmplasx="$tmpd/lasx.c"
    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
    lasx_flags='-march=loongarch64 -mlasx'
    printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
    args="$lasx_flags -o $tmplasx.o $tmplasx"
    {
        $compiler_name $flags $args >/dev/null 2>&1
    } || {
        no_lasx=1
    }

    rm -rf "$tmpd"
fi
case "$data" in
*ARCH_X86_64*) architecture=x86_64 ;;
*ARCH_X86*) architecture=x86 ;;
@ -204,7 +239,7 @@ esac
no_avx512=0
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@ -225,7 +260,7 @@ fi
no_rv64gv=0
if [ "$architecture" = "riscv64" ]; then
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
code='"vsetvli zero, zero, e8, m1\n"'
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@ -241,13 +276,16 @@ fi
no_sve=0
if [ "$architecture" = "arm64" ]; then
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
no_sve=0
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
$compiler_name $flags $args >/dev/null 2>&1
} || {
no_sve=1
}
@ -257,7 +295,7 @@ fi
c11_atomics=0
case "$data" in
*HAVE_C11*)
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
args=" -c -o $tmpf.o $tmpf"
@ -395,6 +433,8 @@ done
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
} >> "$makefile"
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
@ -410,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
} >> "$config"

View File

@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
}
}
# Probe whether the compiler can build LoongArch LSX and LASX code.
# $no_lsx / $no_lasx are set to 1 when the corresponding test compile exits
# with a non-zero status. If File::Temp cannot be loaded the probe is skipped
# with a warning and both flags stay 0.
$no_lsx = 0;
$no_lasx = 0;
if (($architecture eq "loongarch64")) {
    eval "use File::Temp qw(tempfile)";
    if ($@){
        warn "could not load PERL module File::Temp, so could not check LSX and LASX capability";
    } else {
        # LSX: one vector instruction wrapped in inline asm.
        $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
        $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
        $lsx_flags = "-march=loongarch64 -mlsx";
        print $tmplsx "#include <lsxintrin.h>\n\n";
        print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";

        $args = "$lsx_flags -o $tmplsx.o $tmplsx";
        # Single-string command so the shell handles the redirections.
        my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
        system(@cmd);
        if ($? != 0) {
            $no_lsx = 1;
        } else {
            $no_lsx = 0;
        }
        unlink("$tmplsx.o");

        # LASX: same check with the LASX header, instruction and flag.
        $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
        $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
        $lasx_flags = "-march=loongarch64 -mlasx";
        print $tmplasx "#include <lasxintrin.h>\n\n";
        print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";

        $args = "$lasx_flags -o $tmplasx.o $tmplasx";
        # Reuse @cmd instead of redeclaring it in the same scope.
        @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
        system(@cmd);
        if ($? != 0) {
            $no_lasx = 1;
        } else {
            $no_lasx = 0;
        }
        unlink("$tmplasx.o");
    }
}
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
if ($os eq "LINUX") {

View File

@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
/*** BLAS extensions ***/

View File

@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
endif ()
if (DYNAMIC_LIST)
@ -135,7 +135,7 @@ if (ARM64)
set(BINARY_DEFINED 1)
endif ()
if (${ARCH} STREQUAL "riscv64")
if (RISCV64)
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()

View File

@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
endif ()
endif ()
# NVIDIA HPC compiler: append a `-tp` (target processor) option to the common
# C flags -- `pwr8` when POWER is set, `px` for everything else.
# NOTE(review): `px` is presumably the generic x86-64 target; confirm against
# the NVHPC documentation before relying on it for other architectures.
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
if (POWER)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
endif ()
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
@ -172,22 +180,30 @@ endif ()
if (${CORE} STREQUAL NEOVERSEN2)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif ()
endif ()
endif ()
if (${CORE} STREQUAL NEOVERSEV1)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif()
endif ()
endif ()
@ -205,7 +221,11 @@ endif ()
if (${CORE} STREQUAL ARMV8SVE)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
endif ()

View File

@ -3,7 +3,8 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (${F_COMPILER} STREQUAL "FLANG")
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "${EXTRALIB} -lgfortran")
if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
if (NOT NO_LAPACK)
# Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
set(EXTRALIB "${EXTRALIB} -lgfortran")
endif ()
endif ()
if (NO_BINARY_MODE)
if (MIPS64)
@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
endif ()
endif ()
# RISCV64: add -fdefault-integer-8 to the Fortran flags only when both
# BINARY64 and INTERFACE64 are set.
if (RISCV64 AND BINARY64 AND INTERFACE64)
  set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else ()
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
@ -121,7 +131,7 @@ if (${F_COMPILER} STREQUAL "IBM")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "PGI")
if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
if (BINARY64)

View File

@ -124,7 +124,7 @@ set(SLASRC
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
sgesvdq.f slaorhr_col_getrfnp.f
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
slatrs3.f strsyl3.f sgelst.f)
slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90)
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@ -187,7 +187,7 @@ set(CLASRC
cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
crot.f cspcon.f csprfs.f cspsv.f
crot.f crscl.f cspcon.f csprfs.f cspsv.f
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
cstegr.f cstein.f csteqr.f csycon.f
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f
@ -223,7 +223,7 @@ set(CLASRC
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
cungtsqr.f cungtsqr_row.f cunhr_col.f
clatrs3.f ctrsyl3.f cgelst.f)
clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90)
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@ -316,7 +316,7 @@ set(DLASRC
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
dlatrs3.f dtrsyl3.f dgelst.f)
dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90)
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@ -381,7 +381,7 @@ set(ZLASRC
zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
zrot.f zspcon.f zsprfs.f zspsv.f
zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
zstegr.f zstein.f zsteqr.f zsycon.f
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f
@ -419,7 +419,7 @@ set(ZLASRC
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
zungtsqr.f zungtsqr_row.f zunhr_col.f
zlatrs3.f ztrsyl3.f zgelst.f)
zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90)
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
@ -436,6 +436,7 @@ if(USE_XBLAS)
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
endif()
if(BUILD_LAPACK_DEPRECATED)
list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
@ -449,6 +450,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
message(STATUS "Building deprecated routines")
endif()
set(DSLASRC spotrs.f)
@ -622,7 +624,7 @@ set(SLASRC
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
sgesvdq.c slaorhr_col_getrfnp.c
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
slatrs3.c strsyl3.c sgelst.c)
slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c)
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
@ -684,7 +686,7 @@ set(CLASRC
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
crot.c cspcon.c csprfs.c cspsv.c
crot.c crscl.c cspcon.c csprfs.c cspsv.c
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
cstegr.c cstein.c csteqr.c csycon.c
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
@ -720,7 +722,7 @@ set(CLASRC
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
cungtsqr.c cungtsqr_row.c cunhr_col.c
clatrs3.c ctrsyl3.c cgelst.c)
clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c)
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
@ -812,7 +814,7 @@ set(DLASRC
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
dlatrs3.c dtrsyl3.c dgelst.c)
dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c)
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
@ -876,7 +878,7 @@ set(ZLASRC
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
zrot.c zspcon.c zsprfs.c zspsv.c
zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
zstegr.c zstein.c zsteqr.c zsycon.c
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
@ -913,7 +915,8 @@ set(ZLASRC
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c
zgedmd.c zgedmdq.c)
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
@ -930,6 +933,7 @@ if(USE_XBLAS)
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
endif()
if(BUILD_LAPACK_DEPRECATED)
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
@ -943,6 +947,7 @@ list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
message(STATUS "Building deprecated routines")
endif()
set(DSLASRC spotrs.c)

View File

@ -70,8 +70,6 @@ set(CSRC
lapacke_cgeqlf_work.c
lapacke_cgeqp3.c
lapacke_cgeqp3_work.c
lapacke_cgeqpf.c
lapacke_cgeqpf_work.c
lapacke_cgeqr.c
lapacke_cgeqr_work.c
lapacke_cgeqr2.c
@ -92,6 +90,10 @@ set(CSRC
lapacke_cgerqf_work.c
lapacke_cgesdd.c
lapacke_cgesdd_work.c
lapacke_cgedmd.c
lapacke_cgedmd_work.c
lapacke_cgedmdq.c
lapacke_cgedmdq_work.c
lapacke_cgesv.c
lapacke_cgesv_work.c
lapacke_cgesvd.c
@ -144,12 +146,8 @@ set(CSRC
lapacke_cggqrf_work.c
lapacke_cggrqf.c
lapacke_cggrqf_work.c
lapacke_cggsvd.c
lapacke_cggsvd_work.c
lapacke_cggsvd3.c
lapacke_cggsvd3_work.c
lapacke_cggsvp.c
lapacke_cggsvp_work.c
lapacke_cggsvp3.c
lapacke_cggsvp3_work.c
lapacke_cgtcon.c
@ -564,6 +562,8 @@ set(CSRC
lapacke_ctrsna_work.c
lapacke_ctrsyl.c
lapacke_ctrsyl_work.c
lapacke_ctrsyl3.c
lapacke_ctrsyl3_work.c
lapacke_ctrtri.c
lapacke_ctrtri_work.c
lapacke_ctrtrs.c
@ -596,6 +596,8 @@ set(CSRC
lapacke_cungtr_work.c
lapacke_cungtsqr_row.c
lapacke_cungtsqr_row_work.c
lapacke_cunhr_col.c
lapacke_cunhr_col_work.c
lapacke_cunmbr.c
lapacke_cunmbr_work.c
lapacke_cunmhr.c
@ -695,8 +697,6 @@ set(DSRC
lapacke_dgeqlf_work.c
lapacke_dgeqp3.c
lapacke_dgeqp3_work.c
lapacke_dgeqpf.c
lapacke_dgeqpf_work.c
lapacke_dgeqr.c
lapacke_dgeqr_work.c
lapacke_dgeqr2.c
@ -717,6 +717,10 @@ set(DSRC
lapacke_dgerqf_work.c
lapacke_dgesdd.c
lapacke_dgesdd_work.c
lapacke_dgedmd.c
lapacke_dgedmd_work.c
lapacke_dgedmdq.c
lapacke_dgedmdq_work.c
lapacke_dgesv.c
lapacke_dgesv_work.c
lapacke_dgesvd.c
@ -771,12 +775,8 @@ set(DSRC
lapacke_dggqrf_work.c
lapacke_dggrqf.c
lapacke_dggrqf_work.c
lapacke_dggsvd.c
lapacke_dggsvd_work.c
lapacke_dggsvd3.c
lapacke_dggsvd3_work.c
lapacke_dggsvp.c
lapacke_dggsvp_work.c
lapacke_dggsvp3.c
lapacke_dggsvp3_work.c
lapacke_dgtcon.c
@ -874,6 +874,8 @@ set(DSRC
lapacke_dorgtr_work.c
lapacke_dorgtsqr_row.c
lapacke_dorgtsqr_row_work.c
lapacke_dorhr_col.c
lapacke_dorhr_col_work.c
lapacke_dormbr.c
lapacke_dormbr_work.c
lapacke_dormhr.c
@ -1186,6 +1188,8 @@ set(DSRC
lapacke_dtrsna_work.c
lapacke_dtrsyl.c
lapacke_dtrsyl_work.c
lapacke_dtrsyl3.c
lapacke_dtrsyl3_work.c
lapacke_dtrtri.c
lapacke_dtrtri_work.c
lapacke_dtrtrs.c
@ -1275,8 +1279,6 @@ set(SSRC
lapacke_sgeqlf_work.c
lapacke_sgeqp3.c
lapacke_sgeqp3_work.c
lapacke_sgeqpf.c
lapacke_sgeqpf_work.c
lapacke_sgeqr.c
lapacke_sgeqr_work.c
lapacke_sgeqr2.c
@ -1297,6 +1299,10 @@ set(SSRC
lapacke_sgerqf_work.c
lapacke_sgesdd.c
lapacke_sgesdd_work.c
lapacke_sgedmd.c
lapacke_sgedmd_work.c
lapacke_sgedmdq.c
lapacke_sgedmdq_work.c
lapacke_sgesv.c
lapacke_sgesv_work.c
lapacke_sgesvd.c
@ -1351,12 +1357,8 @@ set(SSRC
lapacke_sggqrf_work.c
lapacke_sggrqf.c
lapacke_sggrqf_work.c
lapacke_sggsvd.c
lapacke_sggsvd_work.c
lapacke_sggsvd3.c
lapacke_sggsvd3_work.c
lapacke_sggsvp.c
lapacke_sggsvp_work.c
lapacke_sggsvp3.c
lapacke_sggsvp3_work.c
lapacke_sgtcon.c
@ -1453,6 +1455,8 @@ set(SSRC
lapacke_sorgtr_work.c
lapacke_sorgtsqr_row.c
lapacke_sorgtsqr_row_work.c
lapacke_sorhr_col.c
lapacke_sorhr_col_work.c
lapacke_sormbr.c
lapacke_sormbr_work.c
lapacke_sormhr.c
@ -1762,6 +1766,8 @@ set(SSRC
lapacke_strsna_work.c
lapacke_strsyl.c
lapacke_strsyl_work.c
lapacke_strsyl3.c
lapacke_strsyl3_work.c
lapacke_strtri.c
lapacke_strtri_work.c
lapacke_strtrs.c
@ -1849,8 +1855,6 @@ set(ZSRC
lapacke_zgeqlf_work.c
lapacke_zgeqp3.c
lapacke_zgeqp3_work.c
lapacke_zgeqpf.c
lapacke_zgeqpf_work.c
lapacke_zgeqr.c
lapacke_zgeqr_work.c
lapacke_zgeqr2.c
@ -1871,6 +1875,10 @@ set(ZSRC
lapacke_zgerqf_work.c
lapacke_zgesdd.c
lapacke_zgesdd_work.c
lapacke_zgedmd.c
lapacke_zgedmd_work.c
lapacke_zgedmdq.c
lapacke_zgedmdq_work.c
lapacke_zgesv.c
lapacke_zgesv_work.c
lapacke_zgesvd.c
@ -1925,12 +1933,8 @@ set(ZSRC
lapacke_zggqrf_work.c
lapacke_zggrqf.c
lapacke_zggrqf_work.c
lapacke_zggsvd.c
lapacke_zggsvd_work.c
lapacke_zggsvd3.c
lapacke_zggsvd3_work.c
lapacke_zggsvp.c
lapacke_zggsvp_work.c
lapacke_zggsvp3.c
lapacke_zggsvp3_work.c
lapacke_zgtcon.c
@ -2343,6 +2347,8 @@ set(ZSRC
lapacke_ztrsna_work.c
lapacke_ztrsyl.c
lapacke_ztrsyl_work.c
lapacke_ztrsyl3.c
lapacke_ztrsyl3_work.c
lapacke_ztrtri.c
lapacke_ztrtri_work.c
lapacke_ztrtrs.c
@ -2375,6 +2381,8 @@ set(ZSRC
lapacke_zungtr_work.c
lapacke_zungtsqr_row.c
lapacke_zungtsqr_row_work.c
lapacke_zunhr_col.c
lapacke_zunhr_col_work.c
lapacke_zunmbr.c
lapacke_zunmbr_work.c
lapacke_zunmhr.c
@ -2401,6 +2409,12 @@ set(ZSRC
lapacke_csyr_work.c
lapacke_ilaver.c
)
# The LAPACKE wrappers for deprecated LAPACK routines are only built on request.
# Append them to the per-precision source lists (SSRC/DSRC/CSRC/ZSRC) they were
# removed from above, so they are compiled together with the regular wrappers.
if (BUILD_LAPACK_DEPRECATED)
  list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
  list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
  list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
  list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
endif()
set(SRCX
lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c

View File

@ -55,7 +55,7 @@ if (DEFINED TARGET)
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
@ -280,7 +280,41 @@ if (DEFINED TARGET)
if (${TARGET} STREQUAL POWER8)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
endif()
# Select SVE-enabled code generation flags for the NEOVERSEV1 target.
# NOTE(review): the PGI/NVHPC flag is spelled -Msve_intrinsics here but
# -Msve-intrinsics in the NEOVERSEN2 and ARMV8SVE branches - confirm which
# spelling the compiler accepts.
if (${TARGET} STREQUAL NEOVERSEV1)
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
  else ()
    # Strip the trailing newline printed by -dumpversion so it does not leak
    # into the error message below.
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
    # Quoted so that an empty result (compiler call failed) reaches the
    # FATAL_ERROR branch instead of producing a malformed if() expression.
    if ("${GCC_VERSION}" VERSION_GREATER 10.4 OR "${GCC_VERSION}" VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
    else ()
      message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
    endif()
  endif()
endif()
# Select SVE/SVE2/BF16-enabled code generation flags for the NEOVERSEN2 target.
if (${TARGET} STREQUAL NEOVERSEN2)
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
  else ()
    # Strip the trailing newline printed by -dumpversion so it does not leak
    # into the error message below.
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
    # Quoted so that an empty result (compiler call failed) reaches the
    # FATAL_ERROR branch instead of producing a malformed if() expression.
    if ("${GCC_VERSION}" VERSION_GREATER 10.4 OR "${GCC_VERSION}" VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
      message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
    endif()
  endif()
endif()
# Generic ARMV8SVE target: build the kernels for armv8.2-a with SVE enabled.
# With the PGI compiler id (and NO_SVE unset) the -Msve-intrinsics option is
# passed in addition to the -march setting; every other compiler only gets
# the -march setting.
if (${TARGET} STREQUAL ARMV8SVE)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve")
else ()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
endif()
endif()
endif()
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()

View File

@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
set(LOONGARCH64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
set(RISCV64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1)
else()
@ -107,7 +109,7 @@ else()
endif ()
if (NOT BINARY)
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
set(BINARY 64)
else ()
set(BINARY 32)

View File

@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
#message(STATUS "skipping ${makefile_line}")
continue ()
endif ()
# Example 1: SBGEMM_SMALL_M_PERMIT =
# Unset the variable
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
set(var_name ${CMAKE_MATCH_1})
unset(${var_name})
endif()
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on ${line_match}")

View File

@ -525,7 +525,7 @@ static inline unsigned long long rpcc(void){
#endif // !RPCC_DEFINED
#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
static void __inline blas_lock(volatile BLASULONG *address){
static __inline void blas_lock(volatile BLASULONG *address){
do {
while (*address) {YIELDING;};

View File

@ -45,7 +45,7 @@
#define WMB asm("wmb")
#define RMB asm("mb")
static void __inline blas_lock(unsigned long *address){
static __inline void blas_lock(unsigned long *address){
#ifndef __DECC
unsigned long tmp1, tmp2;
asm volatile(

View File

@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
static void __inline blas_lock(volatile BLASULONG *address){
static __inline void blas_lock(volatile BLASULONG *address){
int register ret;

View File

@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER
static void __inline blas_lock(volatile BLASULONG *address){
static __inline void blas_lock(volatile BLASULONG *address){
BLASULONG ret;

View File

@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#ifndef NO_AFFINITY
static inline int WhereAmI(void){
int ret = 0, counter = 0;
__asm__ volatile (
"rdtimel.w %[counter], %[id]"
: [id]"=r"(ret), [counter]"=r"(counter)
:
: "memory"
);
return ret;
}
#endif
#ifdef DOUBLE
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
#else

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -45,12 +46,14 @@
typedef struct {
int dtb_entries;
int switch_ratio;
int offsetA, offsetB, align;
#if BUILD_BFLOAT16 == 1
int sbgemm_p, sbgemm_q, sbgemm_r;
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
int sbgemm_align_k;
int need_amxtile_permission; // 0 default, 1 for device support amx.
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);

View File

@ -91,7 +91,7 @@
void *qalloc(int flags, size_t bytes);
static void INLINE blas_lock(volatile unsigned long *address){
static INLINE void blas_lock(volatile unsigned long *address){
long int ret, val = 1;

View File

@ -45,7 +45,7 @@
#ifndef ASSEMBLER
static void __inline blas_lock(volatile unsigned long *address){
static __inline void blas_lock(volatile unsigned long *address){
long int ret = 1;

View File

@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
/* Global Parameter */
extern int blas_cpu_number;
extern int blas_num_threads;
extern int blas_num_threads_set;
extern int blas_omp_linked;
#define BLAS_LEGACY 0x8000U
@ -136,15 +135,13 @@ typedef struct blas_queue {
#ifdef SMP_SERVER
extern int blas_server_avail;
extern int blas_omp_number_max;
static __inline int num_cpu_avail(int level) {
#ifdef USE_OPENMP
int openmp_nthreads;
if (blas_num_threads_set == 0)
openmp_nthreads=omp_get_max_threads();
else
openmp_nthreads=blas_cpu_number;
#endif
#ifndef USE_OPENMP
@ -156,7 +153,13 @@ int openmp_nthreads;
) return 1;
#ifdef USE_OPENMP
if (blas_cpu_number != openmp_nthreads) {
if (openmp_nthreads > blas_omp_number_max){
#ifdef DEBUG
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
#endif
openmp_nthreads = blas_omp_number_max;
}
if (blas_cpu_number != openmp_nthreads) {
goto_set_num_threads(openmp_nthreads);
}
#endif

View File

@ -54,7 +54,7 @@
#define __volatile__
#endif
static void __inline blas_lock(volatile BLASULONG *address){
static __inline void blas_lock(volatile BLASULONG *address){
int ret;

View File

@ -70,7 +70,7 @@
#define RMB
#endif
static void __inline blas_lock(volatile BLASULONG *address){
static __inline void blas_lock(volatile BLASULONG *address){
#ifndef C_MSVC

View File

@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef ASSEMBLER
/*
static void __inline blas_lock(volatile BLASULONG *address){
static __inline void blas_lock(volatile BLASULONG *address){
BLASULONG ret;

View File

@ -267,8 +267,9 @@ int detect(void)
}
#else
#ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
#endif
return CPU_ARMV8;
#endif

View File

@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdint.h>
#include <sys/auxv.h>
/* If LASX extension instructions supported,
* using core LOONGSON3R5
@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_LOONGSON3R5 1
#define CPU_LOONGSON2K1000 2
#define LOONGARCH_CFG2 0x02
#define LOONGARCH_LASX 1<<7
#define LOONGARCH_LSX 1<<6
#define LA_HWCAP_LSX (1<<4)
#define LA_HWCAP_LASX (1<<5)
static char *cpuname[] = {
"LOONGSONGENERIC",
@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
int detect(void) {
#ifdef __linux
uint32_t reg = 0;
int flag = (int)getauxval(AT_HWCAP);
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG2)
);
if (reg & LOONGARCH_LASX)
if (flag & LA_HWCAP_LASX)
return CPU_LOONGSON3R5;
else if (reg & LOONGARCH_LSX)
else if (flag & LA_HWCAP_LSX)
return CPU_LOONGSON2K1000;
else
return CPU_GENERIC;

View File

@ -1479,6 +1479,8 @@ int get_cpuname(void){
else
return CPUTYPE_NEHALEM;
case 15: // Sapphire Rapids
if(support_amx_bf16())
return CPUTYPE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
if(support_avx512())
@ -1549,6 +1551,7 @@ int get_cpuname(void){
case 7: // Raptor Lake
case 10:
case 15:
case 14: // Alder Lake N
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
@ -1845,7 +1848,8 @@ static char *cpuname[] = {
"ZEN",
"SKYLAKEX",
"DHYANA",
"COOPERLAKE"
"COOPERLAKE",
"SAPPHIRERAPIDS",
};
static char *lowercpuname[] = {
@ -1902,7 +1906,8 @@ static char *lowercpuname[] = {
"zen",
"skylakex",
"dhyana",
"cooperlake"
"cooperlake",
"sapphirerapids",
};
static char *corename[] = {
@ -1936,7 +1941,8 @@ static char *corename[] = {
"ZEN",
"SKYLAKEX",
"DHYANA",
"COOPERLAKE"
"COOPERLAKE",
"SAPPHIRERAPIDS",
};
static char *corename_lower[] = {
@ -1970,7 +1976,8 @@ static char *corename_lower[] = {
"zen",
"skylakex",
"dhyana",
"cooperlake"
"cooperlake",
"sapphirerapids",
};
@ -2276,16 +2283,18 @@ int get_coretype(void){
return CORE_NEHALEM;
}
if (model == 15) { // Sapphire Rapids
if(support_amx_bf16())
return CORE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
return CORE_COOPERLAKE;
if(support_avx512())
return CPUTYPE_SKYLAKEX;
return CORE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
return CORE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
return CORE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
return CORE_NEHALEM;
}
break;
@ -2352,6 +2361,7 @@ int get_coretype(void){
case 7: // Raptor Lake
case 10:
case 15:
case 14: // Alder Lake N
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;

View File

@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
CEXTRALIB += -lomp
endif
endif
ifeq ($(F_COMPILER), NAG)

270
docs/distributing.md Normal file
View File

@ -0,0 +1,270 @@
# Guidance for redistributing OpenBLAS
*We note that this document contains recommendations only - packagers and other
redistributors are in charge of how OpenBLAS is built and distributed in their
systems, and may have good reasons to deviate from the guidance given on this
page. These recommendations are aimed at general packaging systems, with a user
base that typically is large, open source (or freely available at least), and
neither behaves uniformly nor is directly connected with the packager.*
OpenBLAS has a large number of build-time options which can be used to change
how it behaves at runtime, how artifacts or symbols are named, etc. Variation
in build configuration can be necessary to achieve a given end goal within a
distribution or as an end user. However, such variation can also make it more
difficult to build on top of OpenBLAS and ship code or other packages in a way
that works across many different distros. Here we provide guidance about the
most important build options, what effects they may have when changed, and
which ones to default to.
The Make and CMake build systems provide equivalent options and yield more or
less the same artifacts, but not exactly (the CMake builds are still
experimental). You can choose either one and the options will function in the
same way, however the CMake outputs may require some renaming. To review
available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
the repository.
Build options typically fall into two categories: (a) options that affect the
user interface, such as library and symbol names or APIs that are made
available, and (b) options that affect performance and runtime behavior, such
as threading behavior or CPU architecture-specific code paths. The user
interface options are more important to keep aligned between distributions,
while for the performance-related options there are typically more reasons to
make choices that deviate from the defaults.
Here are recommendations for user interface related packaging choices where it
is not likely to be a good idea to deviate (typically these are the default
settings):
1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
binary size much, so don't turn it off.
2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
while it does make up a significant part of the binary size of the installed
library, that does not outweigh the regression in usability when deviating
from the default here.[^1]
3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
detection files. These files are used by build systems when users want to
link against OpenBLAS, and there is no benefit of leaving them out.
4. Provide the LP64 interface by default, and if in addition to that you choose
to provide an ILP64 interface build as well, use a symbol suffix to avoid
symbol name clashes (see the next section).
[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
know. Older versions of Arch Linux did not, and that was known to cause
problems.
## ILP64 interface builds
The LP64 (32-bit integer) interface is the default build, and has
well-established C and Fortran APIs as determined by the reference (Netlib)
BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
not have a standard API: symbol names and shared/static library names can be
produced in multiple ways, and this tends to make it difficult to use.
As of today there is an agreed-upon way of choosing names for OpenBLAS between
a number of key users/redistributors, which is the closest thing to a standard
that there is now. However, there is an ongoing standardization effort in the
reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
agreed-upon convention. In this section we'll aim to explain both.
Those two methods are fairly similar, and have a key thing in common: *using a
symbol suffix*. This is good practice; if you distribute an ILP64 build, it is
recommended to have it use a symbol suffix containing `64` in the name.
This avoids potential symbol clashes when different packages which depend on
OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
### The current OpenBLAS agreed-upon ILP64 convention
This convention comprises the shared library name and the symbol suffix in the
shared library. The symbol suffix to use is `64_`, implying that the library
name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
The central issue where this was discussed is
[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
To build shared and static libraries with the currently recommended ILP64
conventions with Make:
```bash
$ make INTERFACE64=1 SYMBOLSUFFIX=64_
```
This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
named `openblas64.pc`, and CMake and header files.
Installing locally and inspecting the output will show a few more details:
```bash
$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
$ tree . # output slightly edited down
.
├── include
│   ├── cblas.h
│   ├── f77blas.h
│   ├── lapacke_config.h
│   ├── lapacke.h
│   ├── lapacke_mangling.h
│   ├── lapacke_utils.h
│   ├── lapack.h
│   └── openblas_config.h
└── lib
├── cmake
│   └── openblas
│   ├── OpenBLASConfig.cmake
│   └── OpenBLASConfigVersion.cmake
├── libopenblas64_.a
├── libopenblas64_.so
└── pkgconfig
└── openblas64.pc
```
A key point is the symbol names. These will equal the LP64 symbol names, then
(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
Hence to obtain the final symbol names, we need to take into account which
Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
Fortran, or Flang), that means appending a single underscore. In that case, the
result is:
| base API name | binary symbol name | call from Fortran code | call from C code |
|---------------|--------------------|------------------------|-----------------------|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` |
It is quite useful to have these symbol names be as uniform as possible across
different packaging systems.
The equivalent build options with CMake are:
```bash
$ mkdir build && cd build
$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
$ cmake --build . -j
```
Note that the result is not 100% identical to the Make result. For example, the
library name ends in `_64` rather than `64_` - it is recommended to rename them
to match the Make library names (also update the `libsuffix` entry in
`openblas64.pc` to match that rename).
```bash
$ cmake --install . --prefix $PWD/../../openblas/cmake64
$ tree .
.
├── include
│   └── openblas64
│   ├── cblas.h
│   ├── f77blas.h
│   ├── lapacke_config.h
│   ├── lapacke_example_aux.h
│   ├── lapacke.h
│   ├── lapacke_mangling.h
│   ├── lapacke_utils.h
│   ├── lapack.h
│   ├── openblas64
│   │   └── lapacke_mangling.h
│   └── openblas_config.h
└── lib
├── cmake
│   └── OpenBLAS64
│   ├── OpenBLAS64Config.cmake
│   ├── OpenBLAS64ConfigVersion.cmake
│   ├── OpenBLAS64Targets.cmake
│   └── OpenBLAS64Targets-noconfig.cmake
├── libopenblas_64.a
├── libopenblas_64.so -> libopenblas_64.so.0
└── pkgconfig
└── openblas64.pc
```
### The upcoming standardized ILP64 convention
While the `64_` convention above got some adoption, it's slightly hacky and is
implemented through the use of `objcopy`. An effort is ongoing for a more
broadly adopted convention in the reference BLAS and LAPACK libraries, using
(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
Fortran compiler mangling. The central issue for this is
[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
For the most common cases of compiler mangling (a single `_` appended), the end
result will be:
| base API name | binary symbol name | call from Fortran code | call from C code |
|---------------|--------------------|------------------------|-----------------------|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` |
For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
The shared library name for this `_64` convention should be `libopenblas_64.so`.
Note: it is not yet possible to produce an OpenBLAS build which employs this
convention! Once reference BLAS and LAPACK with support for `_64` have been
released, a future OpenBLAS release will support it. For now, please use the
older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
considered reserved for future use of the `_64` standard as prescribed by
reference BLAS/LAPACK.
## Performance and runtime behavior related build options
For these options there are multiple reasonable or common choices.
### Threading related options
OpenBLAS can be built as a multi-threaded or single-threaded library, with the
default being multi-threaded. It's expected that the default `libopenblas`
library is multi-threaded; if you'd like to also distribute single-threaded
builds, consider naming them `libopenblas_sequential`.
OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
default being pthreads. Both options are commonly used, and the choice here
should not influence the shared library name. The choice will be captured by
the `.pc` file. For example:
```bash
$ pkg-config --libs openblas
-fopenmp -lopenblas
$ cat openblas.pc
...
openblas_config= ... USE_OPENMP=0 MAX_THREADS=24
```
The maximum number of threads users will be able to use is determined at build
time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
range of values that are reasonable to use (up to 256). 64 is a typical choice
here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
Please see `Makefile.rule` for more details.
### CPU architecture related options
OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
distributing to a user base with a variety of hardware, it is recommended to
enable CPU architecture runtime detection. This will dynamically select
optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
build option. This is usually done on all common CPU families, except when
there are known issues.
In case the CPU architecture is known (e.g. you're building binaries for macOS
M1 users), it is possible to specify the target architecture directly with the
`TARGET=` build option.
`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
in this repository.
## Real-world examples
OpenBLAS is likely to be distributed in one of these distribution models:
1. As a standalone package, or multiple packages, in a packaging ecosystem like
a Linux distro, Homebrew, conda-forge or MSYS2.
2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
3. Locally, e.g. making available as a build on a single HPC cluster.
The guidance on this page is most important for models (1) and (2). These links
to build recipes for a representative selection of packaging systems may be
helpful as a reference:
- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
BLASLONG divN, divT;
int mode;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
if (range_m) {
BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
}
*/
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}
@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
divT = nthreads;
divN = 1;
while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
do {
divT --;
divN = 1;

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
int mode, mask;
double dnum, di, dinum;
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
BLASLONG width, i, j, k, js;
BLASLONG m, n, n_from, n_to;
int mode;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
/* Get execution mode */
#ifndef COMPLEX
@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
num_parts = 0;
while (n > 0){
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
if (width < SWITCH_RATIO) {
width = SWITCH_RATIO;
if (width < switch_ratio) {
width = switch_ratio;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);
@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
BLASLONG m = args -> m;
BLASLONG n = args -> n;
BLASLONG nthreads_m, nthreads_n;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
/* Get dimensions from index ranges if available */
if (range_m) {
@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
n = range_n[1] - range_n[0];
}
/* Partitions in m should have at least SWITCH_RATIO rows */
if (m < 2 * SWITCH_RATIO) {
/* Partitions in m should have at least switch_ratio rows */
if (m < 2 * switch_ratio) {
nthreads_m = 1;
} else {
nthreads_m = args -> nthreads;
while (m < nthreads_m * SWITCH_RATIO) {
while (m < nthreads_m * switch_ratio) {
nthreads_m = nthreads_m / 2;
}
}
/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
if (n < SWITCH_RATIO * nthreads_m) {
/* Partitions in n should have at most switch_ratio * nthreads_m columns */
if (n < switch_ratio * nthreads_m) {
nthreads_n = 1;
} else {
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
if (nthreads_m * nthreads_n > args -> nthreads) {
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
}

View File

@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) {
increased_threads = 1;
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
thread_status[i].status = THREAD_STATUS_WAKEUP;

View File

@ -68,6 +68,7 @@
#endif
int blas_server_avail = 0;
int blas_omp_number_max = 0;
extern int openblas_omp_adaptive_env();
@ -100,8 +101,6 @@ static void adjust_thread_buffers() {
void goto_set_num_threads(int num_threads) {
blas_num_threads_set = 1;
if (num_threads < 0) blas_num_threads_set = 0;
if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) {
}
int blas_thread_init(void){
if(blas_omp_number_max <= 0)
blas_omp_number_max = omp_get_max_threads();
blas_get_cpu_number();

View File

@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads)
blas_server_avail = 1;
}
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,

View File

@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE;
#else
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
#endif
#ifdef DYN_SAPPHIRERAPIDS
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
#elif defined(DYN_SKYLAKEX)
#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX
#elif defined(DYN_HASWELL)
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
#else
#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
extern gotoblas_t gotoblas_COOPERLAKE;
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
#endif
#endif
#else
@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
@ -378,6 +394,31 @@ int support_avx512_bf16(){
#endif
}
#define BIT_AMX_TILE 0x01000000
#define BIT_AMX_BF16 0x00400000
#define BIT_AMX_ENBD 0x00060000
/*
 * Report whether the AMX BF16 path may be used.
 *
 * Returns 1 only when all of the following hold:
 *   - support_avx512() reports success,
 *   - CPUID leaf 7, subleaf 0 has both BIT_AMX_TILE and BIT_AMX_BF16 set in EDX,
 *   - CPUID leaf 0xD, subleaf 0 has both bits of BIT_AMX_ENBD (bits 17 and 18)
 *     set in EAX.
 * Returns 0 otherwise, and unconditionally 0 when the library is built with
 * NO_AVX or NO_AVX512.
 */
int support_amx_bf16() {
#if defined(NO_AVX) || defined(NO_AVX512)
  return 0;
#else
  int eax, ebx, ecx, edx;
  const int amx_features = BIT_AMX_TILE | BIT_AMX_BF16;

  if (!support_avx512())
    return 0;

  /* CPUID.7.0:EDX carries the AMX feature bits; both must be present. */
  cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
  if ((edx & amx_features) != amx_features)
    return 0;

  /* CPUID.D.0:EAX[17:18] must both be set for the tile state to be usable. */
  cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
  return ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) ? 1 : 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){
}
}
if (model == 15){ // Sapphire Rapids
if(support_amx_bf16())
return &gotoblas_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
if (support_avx512())
@ -941,7 +984,8 @@ static char *corename[] = {
"Excavator",
"Zen",
"SkylakeX",
"Cooperlake"
"Cooperlake",
"SapphireRapids"
};
char *gotoblas_corename(void) {
@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26];
return corename[0];
}

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
#else
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#endif
#ifdef DYN_ARMV8SVE
extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55
extern gotoblas_t gotoblas_CORTEXA55;
#else
@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
#ifndef NO_SVE
extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
extern gotoblas_t gotoblas_THUNDERX3T110;
extern gotoblas_t gotoblas_CORTEXA55;
#endif
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
#define NUM_CORETYPES 13
#define NUM_CORETYPES 16
/*
* In case asm/hwcap.h is outdated on the build system, make sure
@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg);
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
#define get_cpu_ftr(id, var) ({ \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
@ -168,6 +181,7 @@ static char *corename[] = {
"neoversen2",
"thunderx3t110",
"cortexa55",
"armv8sve",
"unknown"
};
@ -187,6 +201,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
return corename[NUM_CORETYPES];
}
@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 12: return (&gotoblas_NEOVERSEN2);
case 13: return (&gotoblas_THUNDERX3T110);
case 14: return (&gotoblas_CORTEXA55);
case 15: return (&gotoblas_ARMV8SVE);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_NEOVERSEN1;
#ifndef NO_SVE
case 0xd49:
return &gotoblas_NEOVERSEN2;
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
} else
return &gotoblas_NEOVERSEN2;
case 0xd40:
return &gotoblas_NEOVERSEV1;
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
}else
return &gotoblas_NEOVERSEV1;
#endif
case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55;
@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
openblas_warning(1, coremsg);
}
#ifndef NO_SVE
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
return &gotoblas_ARMV8SVE;
}
#endif
return NULL;
#endif
}

View File

@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
/* Return the current value of the global blas_cpu_number. */
int goto_get_num_procs (void) {
return blas_cpu_number;
}
@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
/* Return the current value of the global blas_cpu_number. */
int goto_get_num_procs (void) {
return blas_cpu_number;
}
@ -3015,6 +3011,8 @@ void *blas_memory_alloc(int procpos){
#endif
if (memory_overflowed) goto terminate;
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS);
memory_overflowed=1;
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));

View File

@ -283,7 +283,6 @@ The numbers of threads in the thread pool.
This value is equal to or larger than blas_cpu_number. This means some threads are sleeping.
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;

View File

@ -21,7 +21,7 @@ blasobjsc="
chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum"
ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt"
blasobjsd="
damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
@ -29,7 +29,7 @@ blasobjsd="
dscal dsdot dspmv dspr2 dimatcopy domatcopy
dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
idamax idamin idmax idmin dgeadd dsum"
idamax idamin idmax idmin dgeadd dsum dgemmt"
blasobjss="
isamax isamin ismax ismin
@ -38,7 +38,7 @@ blasobjss="
smax smin snrm2 simatcopy somatcopy
srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
strmm strmv strsm strsv sgeadd ssum"
strmm strmv strsm strsv sgeadd ssum sgemmt"
blasobjsz="
izamax izamin
@ -48,7 +48,7 @@ blasobjsz="
zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
zomatcopy zimatcopy dzamax dzamin dzasum dznrm2
zgeadd dzsum"
zgeadd dzsum zgemmt"
blasobjs="lsame xerbla"
bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
@ -58,7 +58,7 @@ cblasobjsc="
cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
cblas_scnrm2 cblas_scasum
cblas_scnrm2 cblas_scasum cblas_cgemmt
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
"
cblasobjsd="
@ -67,7 +67,7 @@ cblasobjsd="
cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
"
@ -78,7 +78,7 @@ cblasobjss="
cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
cblas_strsv cblas_sgeadd
cblas_strsv cblas_sgeadd cblas_sgemmt
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
"
@ -89,7 +89,7 @@ cblasobjsz="
cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
cblas_zaxpby cblas_zgeadd
cblas_zaxpby cblas_zgeadd cblas_zgemmt
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
"
@ -716,6 +716,7 @@ lapackobjs2z="$lapackobjs2z
# functions added for lapack-3.7.0
lapackobjs2s="$lapackobjs2s
slarfy
ssyconvf
strevc3
sgelqt
sgelqt3
@ -843,6 +844,23 @@ lapackobjs2z="$lapackobjs2z
zungtsqr_row
"
#functions added for lapack-3.11
lapackobjs2c="$lapackobjs2c
cgedmd
cgedmdq
"
lapackobjs2d="$lapackobjs2d
dgedmd
dgedmdq
"
lapackobjs2s="$lapackobjs2s
sgedmd
sgedmdq
"
lapackobjs2z="$lapackobjs2z
zgedmd
zgedmdq
"
lapack_extendedprecision_objs="
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@ -1012,6 +1030,10 @@ lapackeobjsc="
LAPACKE_cgebrd_work
LAPACKE_cgecon
LAPACKE_cgecon_work
LAPACKE_cgedmd
LAPACKE_cgedmd_work
LAPACKE_cgedmdq
LAPACKE_cgedmdq_work
LAPACKE_cgeequ
LAPACKE_cgeequ_work
LAPACKE_cgeequb
@ -1671,6 +1693,10 @@ lapackeobjsd="
LAPACKE_dgebrd_work
LAPACKE_dgecon
LAPACKE_dgecon_work
LAPACKE_dgedmd
LAPACKE_dgedmd_work
LAPACKE_dgedmdq
LAPACKE_dgedmdq_work
LAPACKE_dgeequ
LAPACKE_dgeequ_work
LAPACKE_dgeequb
@ -2284,6 +2310,10 @@ lapackeobjss="
LAPACKE_sgebrd_work
LAPACKE_sgecon
LAPACKE_sgecon_work
LAPACKE_sgedmd
LAPACKE_sgedmd_work
LAPACKE_sgedmdq
LAPACKE_sgedmdq_work
LAPACKE_sgeequ
LAPACKE_sgeequ_work
LAPACKE_sgeequb
@ -2893,6 +2923,10 @@ lapackeobjsz="
LAPACKE_zgebrd_work
LAPACKE_zgecon
LAPACKE_zgecon_work
LAPACKE_zgedmd
LAPACKE_zgedmd_work
LAPACKE_zgedmdq
LAPACKE_zgedmdq_work
LAPACKE_zgeequ
LAPACKE_zgeequ_work
LAPACKE_zgeequb

View File

@ -21,7 +21,7 @@
chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum);
ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
@blasobjsd = (
damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
@ -29,7 +29,7 @@
dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
idamax,idamin,idmax,idmin,dgeadd,dsum);
idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
@blasobjss = (
isamax,isamin,ismax,ismin,
@ -38,7 +38,7 @@
smax,smin,snrm2,simatcopy,somatcopy,
srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
strmm,strmv,strsm,strsv, sgeadd,ssum);
strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
@blasobjsz = (
izamax,izamin,,
@ -48,7 +48,7 @@
zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
zgeadd, dzsum);
zgeadd, dzsum, zgemmt);
@blasobjs = (lsame, xerbla);
@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@ -60,7 +60,7 @@
cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv,
cblas_scnrm2, cblas_scasum,
cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
);
cblas_cgemmt);
@cblasobjsd = (
cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
@ -69,7 +69,7 @@
cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
);
cblas_dgemmt);
@cblasobjss = (
cblas_sasum, cblas_saxpy, cblas_saxpby,
@ -80,7 +80,7 @@
cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
cblas_strsv, cblas_sgeadd,
cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
);
cblas_sgemmt);
@cblasobjsz = (
cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
@ -90,7 +90,7 @@
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
cblas_zaxpby, cblas_zgeadd,
cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
);
cblas_zgemmt);
@cblasobjs = ( cblas_xerbla );

View File

@ -101,7 +101,14 @@ else
*flang*)
vendor=FLANG
openmp='-fopenmp'
;;
data=`$compiler -v 2>&1 > /dev/null `
v="${data#*version *}"
v="${v%%*.}"
major="${v%%.*}"
if [ "$major" -ge 17 ]; then
vendor=FLANGNEW
fi
;;
*ifort*|*ifx*)
vendor=INTEL
openmp='-fopenmp'

View File

@ -1930,15 +1930,15 @@ printf("ELF_VERSION=2\n");
#ifdef MAKE_NB_JOBS
#if MAKE_NB_JOBS > 0
printf("MAKE += -j %d\n", MAKE_NB_JOBS);
printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS);
#else
// Let make use parent -j argument or -j1 if there
// is no make parent
#endif
#elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
printf("MAKEFLAGS += -j 1\n");
#else
printf("MAKE += -j %d\n", get_num_cores());
printf("MAKEFLAGS += -j %d\n", get_num_cores());
#endif
break;

View File

@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
info = 0;
if (lda < MAX(1, m)) info = 6;
if (lda < MAX(1, m)) info = 5;
if (ldc < MAX(1, m)) info = 8;
if (n < 0) info = 2;

View File

@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = {
#endif
#endif
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
#define XFEATURE_XTILEDATA 18
#define ARCH_REQ_XCOMP_PERM 0x1023
/* Set to 1 once the XTILEDATA permission request below has succeeded. */
static int openblas_amxtile_permission = 0;

/*
 * Request permission from the kernel to use the XTILEDATA state component
 * by issuing arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA).
 *
 * Returns 0 and sets openblas_amxtile_permission to 1 when the syscall
 * reports success. Returns -1, leaves the flag unchanged and prints a
 * diagnostic to stderr when the syscall returns a non-zero status.
 */
static int init_amxtile_permission(void) {
  long status =
      syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
  if (status != 0) {
    fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, "
                    "Intel Sapphire Rapids), skip sbgemm calculation\n");
    return -1;
  }
  openblas_amxtile_permission = 1;
  return 0;
}
#endif
#ifndef CBLAS
void NAME(char *TRANSA, char *TRANSB,
@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#endif
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
#if defined(DYNAMIC_ARCH)
if (gotoblas->need_amxtile_permission &&
openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
return;
}
#endif
#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS)
if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
return;
}
#endif
#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
if ((args.m == 0) || (args.n == 0)) return;
#if 0

View File

@ -35,29 +35,26 @@
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE
#define ERROR_NAME "QGEMT "
#define ERROR_NAME "QGEMMT "
#elif defined(DOUBLE)
#define ERROR_NAME "DGEMT "
#define ERROR_NAME "DGEMMT "
#elif defined(BFLOAT16)
#define ERROR_NAME "SBGEMT "
#define ERROR_NAME "SBGEMMT "
#else
#define ERROR_NAME "SGEMT "
#define ERROR_NAME "SGEMMT "
#endif
#else
#define SMP_THRESHOLD_MIN 8192.0
#ifdef XDOUBLE
#define ERROR_NAME "XGEMT "
#define ERROR_NAME "XGEMMT "
#elif defined(DOUBLE)
#define ERROR_NAME "ZGEMT "
#define ERROR_NAME "ZGEMMT "
#else
#define ERROR_NAME "CGEMT "
#define ERROR_NAME "CGEMMT "
#endif
#endif
@ -68,18 +65,19 @@
#ifndef CBLAS
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
blasint * M, blasint * N, blasint * K,
blasint * M, blasint * K,
FLOAT * Alpha,
IFLOAT * a, blasint * ldA,
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
{
blasint m, n, k;
blasint m, k;
blasint lda, ldb, ldc;
int transa, transb, uplo;
blasint info;
char transA, transB, Uplo;
blasint nrowa, nrowb;
IFLOAT *buffer;
IFLOAT *aa, *bb;
FLOAT *cc;
@ -92,7 +90,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
PRINT_DEBUG_NAME;
m = *M;
n = *N;
k = *K;
#if defined(COMPLEX)
@ -159,32 +156,39 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
if (Uplo == 'L')
uplo = 1;
nrowa = m;
if (transa) nrowa = k;
nrowb = k;
if (transb) nrowb = m;
info = 0;
if (uplo < 0)
info = 14;
if (ldc < m)
if (ldc < MAX(1, m))
info = 13;
if (ldb < MAX(1, nrowa))
info = 10;
if (lda < MAX(1, nrowb))
info = 8;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
info = 4;
if (transb < 0)
info = 2;
info = 3;
if (transa < 0)
info = 2;
if (uplo < 0)
info = 1;
if (info) {
if (info != 0) {
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
blasint N, blasint k,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
blasint k,
#ifndef COMPLEX
FLOAT alpha,
IFLOAT * A, blasint LDA,
@ -205,17 +209,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
int transa, transb, uplo;
blasint info;
blasint m, n, lda, ldb;
blasint lda, ldb;
FLOAT *a, *b;
XFLOAT *buffer;
PRINT_DEBUG_CNAME;
uplo = -1;
transa = -1;
transb = -1;
info = 0;
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
if (TransA == CblasNoTrans)
transa = 0;
@ -248,9 +255,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
transb = 3;
#endif
m = M;
n = N;
a = (void *)A;
b = (void *)B;
lda = LDA;
@ -258,23 +262,31 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1;
if (ldc < m)
blasint nrowa, nrowb;
nrowa = m;
if (transa) nrowa = k;
nrowb = k;
if (transb) nrowb = m;
if (ldc < MAX(1, m))
info = 13;
if (ldb < MAX(1, nrowb))
info = 10;
if (lda < MAX(1, nrowa))
info = 8;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
info = 4;
if (transb < 0)
info = 2;
info = 3;
if (transa < 0)
info = 2;
if (uplo < 0)
info = 1;
}
if (order == CblasRowMajor) {
m = N;
n = M;
a = (void *)B;
b = (void *)A;
@ -282,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
lda = LDB;
ldb = LDA;
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
if (TransB == CblasNoTrans)
transa = 0;
if (TransB == CblasTrans)
@ -315,29 +330,30 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1;
if (ldc < m)
blasint ncola, ncolb;
ncola = k;
if (transa) ncola = m;
ncolb = m;
if (transb) ncolb = k;
if (ldc < MAX(1,m))
info = 13;
if (ldb < MAX(1, ncolb))
info = 10;
if (lda < MAX(1, ncola))
info = 8;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
info = 4;
if (transb < 0)
info = 2;
info = 3;
if (transa < 0)
info = 2;
if (uplo < 0)
info = 1;
}
uplo = -1;
if (Uplo == CblasUpper)
uplo = 0;
if (Uplo == CblasLower)
uplo = 1;
if (uplo < 0)
info = 14;
if (info >= 0) {
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
@ -407,37 +423,35 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if ((m == 0) || (n == 0))
if (m == 0)
return;
IDEBUG_START;
FUNCTION_PROFILE_START();
const blasint incb = (transb == 0) ? 1 : ldb;
if (uplo == 1) {
for (i = 0; i < n; i++) {
j = n - i;
for (i = 0; i < m; i++) {
j = m - i;
l = j;
#if defined(COMPLEX)
aa = a + i * 2;
bb = b + i * ldb * 2;
if (transa) {
l = k;
aa = a + lda * i * 2;
bb = b + i * 2;
}
if (transb)
bb = b + i * 2;
cc = c + i * 2 * ldc + i * 2;
#else
aa = a + i;
bb = b + i * ldb;
if (transa) {
l = k;
aa = a + lda * i;
bb = b + i;
}
if (transb)
bb = b + i;
cc = c + i * ldc + i;
#endif
@ -458,8 +472,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
@ -479,20 +491,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
#if defined(COMPLEX)
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
aa, lda, bb, incb, cc, 1,
buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
aa, lda, bb, incb, cc, 1,
buffer);
#else
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
bb, incb, cc, 1, buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
bb, incb, cc, 1, buffer);
#endif
#ifdef SMP
} else {
if (!transa)
(gemv_thread[(int)transa]) (j, k, alpha, aa,
lda, bb, incb, cc,
1, buffer,
nthreads);
else
(gemv_thread[(int)transa]) (k, j, alpha, aa,
lda, bb, incb, cc,
1, buffer,
nthreads);
}
#endif
@ -501,21 +527,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
} else {
for (i = 0; i < n; i++) {
for (i = 0; i < m; i++) {
j = i + 1;
l = j;
#if defined COMPLEX
bb = b + i * ldb * 2;
if (transa) {
l = k;
if (transb) {
bb = b + i * 2;
}
cc = c + i * 2 * ldc;
#else
bb = b + i * ldb;
if (transa) {
l = k;
if (transb) {
bb = b + i;
}
cc = c + i * ldc;
@ -537,8 +561,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
@ -558,30 +580,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
#if defined(COMPLEX)
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
a, lda, bb, incb, cc, 1,
buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
a, lda, bb, incb, cc, 1,
buffer);
#else
if (!transa)
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
incb, cc, 1, buffer);
else
(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
incb, cc, 1, buffer);
#endif
#ifdef SMP
} else {
if (!transa)
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
bb, incb, cc, 1,
buffer, nthreads);
else
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
bb, incb, cc, 1,
buffer, nthreads);
}
#endif
STACK_FREE(buffer);
}
}
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
args.m * args.k + args.k * args.n +
args.m * args.n, 2 * args.m * args.n * args.k);
IDEBUG_END;

View File

@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( order == BlasColMajor)
{
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
if ( trans == BlasTrans && *ldb < *cols ) info = 9;
if ( trans == BlasNoTrans && *ldb < *rows ) info = 8;
if ( trans == BlasTrans && *ldb < *cols ) info = 8;
}
if ( order == BlasRowMajor)
{
if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
if ( trans == BlasTrans && *ldb < *rows ) info = 9;
if ( trans == BlasNoTrans && *ldb < *cols ) info = 8;
if ( trans == BlasTrans && *ldb < *rows ) info = 8;
}
if ( order == BlasColMajor && *lda < *rows ) info = 7;
@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#ifdef NEW_IMATCOPY
if ( *lda == *ldb && *rows == *cols) {
if ( *lda == *ldb ) {
if ( order == BlasColMajor )
{
if ( trans == BlasNoTrans )
{
IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda );
return;
}
else
else if ( *rows == *cols )
{
IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda );
return;
}
}
else
@ -138,26 +141,23 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasNoTrans )
{
IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda );
return;
}
else
else if ( *rows == *cols )
{
IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
return;
}
}
return;
}
#endif
if ( *lda > *ldb )
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT);
else
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT);
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
b = malloc(msize);
if ( b == NULL )
{
printf("Memory alloc failed\n");
printf("Memory alloc failed in imatcopy\n");
exit(1);
}
@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
{
if ( trans == BlasNoTrans )
{
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb );
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb );
}
else
{
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb );
}
}
else
{
if ( trans == BlasNoTrans )
{
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb );
}
else
{
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb );
}
}

View File

@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
if (n <= 0) return 0.;
#ifndef COMPLEX
if (n == 1)
#ifdef DOUBLE
return fabs(x[0]);
#else
return fabsf(x[0]);
#endif
#endif
if (incx < 0)
#ifdef COMPLEX
x -= (n - 1) * incx * 2;
#else
x -= (n - 1) * incx;
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();
@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
if (n <= 0) return 0.;
#ifndef COMPLEX
if (n == 1)
#ifdef DOUBLE
return fabs(x[0]);
#else
return fabsf(x[0]);
#endif
#endif
if (incx < 0)
#ifdef COMPLEX
x -= (n - 1) * incx * 2;
#else
x -= (n - 1) * incx;
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@ -1,9 +1,11 @@
#include <math.h>
#include <float.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef CBLAS
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
#endif
#ifdef DOUBLE
long double safmin = DBL_MIN;
#else
long double safmin = FLT_MIN;
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
long double da = *DA;
long double db = *DB;
long double c;
long double s;
long double r, roe, z;
long double r, z;
long double sigma, dascal,dbscal;
long double ada = fabsl(da);
long double adb = fabsl(db);
long double scale = ada + adb;
long double maxab = MAX(ada,adb);
long double safmax;
long double scale;
#ifndef CBLAS
PRINT_DEBUG_NAME;
@ -32,17 +44,25 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
PRINT_DEBUG_CNAME;
#endif
roe = db;
if (ada > adb) roe = da;
if (scale == ZERO) {
if (adb == ZERO) {
*C = ONE;
*S = ZERO;
*DA = ZERO;
*DB = ZERO;
} else if (ada == ZERO) {
*C = ZERO;
*S = ONE;
*DA = *DB;
*DB = ONE;
} else {
r = sqrt(da * da + db * db);
if (roe < 0) r = -r;
safmax = 1./safmin;
scale = MIN(MAX(safmin,maxab), safmax);
if (ada > adb)
sigma = copysign(1.,da);
else
sigma = copysign(1.,db);
dascal = da / scale;
dbscal = db / scale;
r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal));
c = da / r;
s = db / r;
z = ONE;
@ -65,11 +85,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
FLOAT db = *DB;
FLOAT c = *C;
FLOAT s = *S;
FLOAT r, roe, z;
FLOAT sigma;
FLOAT r, z;
FLOAT ada = fabs(da);
FLOAT adb = fabs(db);
FLOAT scale = ada + adb;
FLOAT maxab = MAX(ada,adb);
long double safmax ;
FLOAT scale ;
safmax = 1./safmin;
scale = MIN(MAX(safmin,maxab), safmax);
if (ada > adb)
sigma = copysign(1.,da);
else
sigma = copysign(1.,db);
#ifndef CBLAS
PRINT_DEBUG_NAME;
@ -77,20 +108,21 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
PRINT_DEBUG_CNAME;
#endif
roe = db;
if (ada > adb) roe = da;
if (scale == ZERO) {
if (adb == ZERO) {
*C = ONE;
*S = ZERO;
*DA = ZERO;
*DB = ZERO;
} else if (ada == ZERO) {
*C = ZERO;
*S = ONE;
*DA = *DB;
*DB = ONE;
} else {
FLOAT aa = da / scale;
FLOAT bb = db / scale;
r = scale * sqrt(aa * aa + bb * bb);
if (roe < 0) r = -r;
r = sigma * scale * sqrt(aa * aa + bb * bb);
c = da / r;
s = db / r;
z = ONE;

View File

@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO,
int nodes;
#endif
# if defined(SMP)
int MN;
double MN;
#endif
blasint info;
int side;
@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
int nodes;
#endif
#if defined(SMP)
int MN;
double MN;
#endif
PRINT_DEBUG_CNAME;

View File

@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS,
FLOAT *sa, *sb;
#ifdef SMP
int NNK;
double NNK;
#ifdef USE_SIMPLE_THREADED_LEVEL3
#ifndef COMPLEX
#ifdef XDOUBLE
@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
FLOAT *sa, *sb;
#ifdef SMP
int NNK;
double NNK;
#ifdef USE_SIMPLE_THREADED_LEVEL3
#ifndef COMPLEX

View File

@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#ifdef NEW_IMATCOPY
if (*lda == *ldb && *cols == *rows) {
if (*lda == *ldb ) {
if ( order == BlasColMajor )
{
if ( trans == BlasNoTrans )
{
IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasConj )
{
IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTrans )
if ( trans == BlasTrans && *rows == *cols )
{
IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTransConj )
if ( trans == BlasTransConj && *rows == *cols )
{
IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
}
else
{
@ -153,67 +159,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasNoTrans )
{
IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasConj )
{
IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTrans )
if ( trans == BlasTrans && *rows == *cols )
{
IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTransConj )
if ( trans == BlasTransConj && *rows == *cols )
{
IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
}
return;
}
#endif
if ( *lda > *ldb )
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2;
else
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2;
b = malloc(msize);
if ( b == NULL )
{
printf("Memory alloc failed in zimatcopy\n");
exit(1);
}
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
b = malloc(msize);
if ( b == NULL )
{
printf("Memory alloc failed in zimatcopy\n");
exit(1);
}
if ( order == BlasColMajor )
{
if ( trans == BlasNoTrans )
{
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
if ( trans == BlasConj )
else if ( trans == BlasConj )
{
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
if ( trans == BlasTrans )
else if ( trans == BlasTrans )
{
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
if ( trans == BlasTransConj )
else if ( trans == BlasTransConj )
{
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
}
@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasNoTrans )
{
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
if ( trans == BlasConj )
else if ( trans == BlasConj )
{
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
if ( trans == BlasTrans )
else if ( trans == BlasTrans )
{
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
if ( trans == BlasTransConj )
else if ( trans == BlasTransConj )
{
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
}
free(b);
return;

View File

@ -1,9 +1,11 @@
#include <math.h>
#include <float.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef CBLAS
void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
@ -14,123 +16,166 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
FLOAT *S = (FLOAT*) VS;
#endif /* CBLAS */
#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
long double da_r = *(DA + 0);
long double da_i = *(DA + 1);
long double db_r = *(DB + 0);
long double db_i = *(DB + 1);
long double r;
long double ada = fabsl(da_r) + fabsl(da_i);
PRINT_DEBUG_NAME;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (ada == ZERO) {
*C = ZERO;
*(S + 0) = ONE;
*(S + 1) = ZERO;
*(DA + 0) = db_r;
*(DA + 1) = db_i;
} else {
long double alpha_r, alpha_i;
ada = sqrt(da_r * da_r + da_i * da_i);
r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i);
alpha_r = da_r / ada;
alpha_i = da_i / ada;
*(C + 0) = ada / r;
*(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r;
*(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r;
*(DA + 0) = alpha_r * r;
*(DA + 1) = alpha_i * r;
}
#ifdef DOUBLE
long double safmin = DBL_MIN;
long double rtmin = sqrt(DBL_MIN/DBL_EPSILON);
#else
FLOAT da_r = *(DA + 0);
FLOAT da_i = *(DA + 1);
FLOAT db_r = *(DB + 0);
FLOAT db_i = *(DB + 1);
FLOAT r;
FLOAT ada = fabs(da_r) + fabs(da_i);
FLOAT adb;
PRINT_DEBUG_NAME;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (ada == ZERO) {
*C = ZERO;
*(S + 0) = ONE;
*(S + 1) = ZERO;
*(DA + 0) = db_r;
*(DA + 1) = db_i;
} else {
FLOAT scale;
FLOAT aa_r, aa_i, bb_r, bb_i;
FLOAT alpha_r, alpha_i;
aa_r = fabs(da_r);
aa_i = fabs(da_i);
if (aa_i > aa_r) {
aa_r = fabs(da_i);
aa_i = fabs(da_r);
}
if (aa_r == ZERO) {
ada = 0.;
} else {
scale = (aa_i / aa_r);
ada = aa_r * sqrt(ONE + scale * scale);
}
bb_r = fabs(db_r);
bb_i = fabs(db_i);
if (bb_i > bb_r) {
bb_r = fabs(bb_i);
bb_i = fabs(bb_r);
}
if (bb_r == ZERO) {
adb = 0.;
} else {
scale = (bb_i / bb_r);
adb = bb_r * sqrt(ONE + scale * scale);
}
scale = ada + adb;
aa_r = da_r / scale;
aa_i = da_i / scale;
bb_r = db_r / scale;
bb_i = db_i / scale;
r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
alpha_r = da_r / ada;
alpha_i = da_i / ada;
*(C + 0) = ada / r;
*(S + 0) = (alpha_r * db_r + alpha_i *db_i) / r;
*(S + 1) = (alpha_i * db_r - alpha_r *db_i) / r;
*(DA + 0) = alpha_r * r;
*(DA + 1) = alpha_i * r;
}
long double safmin = FLT_MIN;
long double rtmin = sqrt(FLT_MIN/FLT_EPSILON);
#endif
FUNCTION_PROFILE_END(4, 4, 4);
IDEBUG_END;
FLOAT da_r = *(DA+0);
FLOAT da_i = *(DA+1);
FLOAT db_r = *(DB+0);
FLOAT db_i = *(DB+1);
//long double r;
FLOAT *r, *S1=(FLOAT *)malloc(2*sizeof(FLOAT));
FLOAT *R=(FLOAT *)malloc(2*sizeof(FLOAT));
long double d;
return;
FLOAT ada = da_r * da_r + da_i * da_i;
FLOAT adb = db_r * db_r + db_i * db_i;
FLOAT adart = sqrt( da_r * da_r + da_i * da_i);
FLOAT adbrt = sqrt( db_r * db_r + db_i * db_i);
PRINT_DEBUG_NAME;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (db_r == ZERO && db_i == ZERO) {
*C = ONE;
*(S + 0) = ZERO;
*(S + 1) = ZERO;
return;
}
long double safmax = 1./safmin;
#if defined DOUBLE
long double rtmax = safmax /DBL_EPSILON;
#else
long double rtmax = safmax /FLT_EPSILON;
#endif
*(S1 + 0) = *(DB + 0);
*(S1 + 1) = *(DB + 1) *-1;
if (da_r == ZERO && da_i == ZERO) {
*C = ZERO;
if (db_r == ZERO) {
(*DA) = fabsl(db_i);
*S = *S1 /da_r;
*(S+1) = *(S1+1) /da_r;
return;
} else if ( db_i == ZERO) {
*DA = fabsl(db_r);
*S = *S1 /da_r;
*(S+1) = *(S1+1) /da_r;
return;
} else {
long double g1 = MAX( fabsl(db_r), fabsl(db_i));
rtmax =sqrt(safmax/2.);
if (g1 > rtmin && g1 < rtmax) { // unscaled
d = sqrt(adb);
*S = *S1 /d;
*(S+1) = *(S1+1) /d;
*DA = d ;
*(DA+1) = ZERO;
return;
} else { // scaled algorithm
long double u = MIN ( safmax, MAX ( safmin, g1));
FLOAT gs_r = db_r/u;
FLOAT gs_i = db_i/u;
d = sqrt ( gs_r*gs_r + gs_i*gs_i);
*S = gs_r / d;
*(S + 1) = (gs_i * -1) / d;
*DA = d * u;
*(DA+1) = ZERO;
return;
}
}
} else {
FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i));
FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i));
rtmax = sqrt(safmax / 4.);
if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled
long double h = ada + adb;
double adahsq = sqrt(ada * h);
if (ada >= h *safmin) {
*C = sqrt(ada/h);
*R = *DA / *C;
*(R+1) = *(DA+1) / *(C+1);
rtmax *= 2.;
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
*(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq);
} else {
*S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h);
*(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h);
}
} else {
*C = ada / adahsq;
if (*C >= safmin)
*R = *DA / *C;
else
*R = *DA * (h / adahsq);
*S = *S1 * ada / adahsq;
*(S+1) = *(S1+1) * ada / adahsq;
}
*DA=*R;
*(DA+1)=*(R+1);
return;
} else { // scaled
FLOAT fs_r, fs_i, gs_r, gs_i;
long double v,w,f2,g2,h;
long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1)));
gs_r = db_r/u;
gs_i = db_i/u;
g2 = sqrt ( gs_r*gs_r + gs_i*gs_i);
if (f1 /u < rtmin) {
v = MIN (safmax, MAX (safmin, f1));
w = v / u;
fs_r = *DA/ v;
fs_i = *(DA+1) / v;
f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
h = f2 * w * w + g2;
} else { // use same scaling for both
w = 1.;
fs_r = *DA/ u;
fs_i = *(DA+1) / u;
f2 = sqrt ( fs_r*fs_r + fs_i*fs_i);
h = f2 + g2;
}
if ( f2 >= h * safmin) {
*C = sqrt ( f2 / h );
*DA = fs_r / *C;
*(DA+1) = fs_i / *C;
rtmax *= 2;
if ( f2 > rtmin && h < rtmax) {
*S = gs_r * (fs_r /sqrt(f2*h)) - gs_i * (fs_i / sqrt(f2*h));
*(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h));
} else {
*S = gs_r * (*DA/h) - gs_i * (*(DA+1) / h);
*(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h);
}
} else { // intermediates might overflow
d = sqrt ( f2 * h);
*C = f2 /d;
if (*C >= safmin) {
*DA = fs_r / *C;
*(DA+1) = fs_i / *C;
} else {
*DA = fs_r * (h / d);
*(DA+1) = fs_i / (h / d);
}
*S = gs_r * (fs_r /d) - gs_i * (fs_i / d);
*(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d);
}
*C *= w;
*DA *= u;
*(DA+1) *= u;
return;
}
}
}

View File

@ -33,7 +33,7 @@ endif
ifdef TARGET_CORE
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))
override CFLAGS += -march=sapphirerapids
else
override CFLAGS += -march=skylake-avx512 -mavx512f
@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
endif
else ifeq ($(TARGET_CORE), COOPERLAKE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9)))
override CFLAGS += -march=cooperlake
else
override CFLAGS += -march=skylake-avx512 -mavx512f
@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else ifeq ($(TARGET_CORE), LOONGSON3R4)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
ifeq ($(C_COMPILER), PGI)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
else
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
endif
else
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
endif

View File

@ -35,6 +35,12 @@ USE_TRMM = 1
endif
endif
ifneq ($(DYNAMIC_ARCH), 1)
ifeq ($(TARGET), MIPS64_GENERIC)
USE_TRMM = 1
endif
endif
ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif

View File

@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT absxi = 0.0;
if (n <= 0 || inc_x <= 0) return(0.0);
if (n <= 0 || inc_x == 0) return(0.0);
if ( n == 1 ) return( ABS(x[0]) );
n *= inc_x;

View File

@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG inc_x2;
FLOAT temp;
if (n <= 0 || inc_x <= 0) return(0.0);
if (n <= 0 || inc_x == 0) return(0.0);
inc_x2 = 2 * inc_x;

View File

@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DDOTKERNEL = dot.c
SDOTKERNEL = dot.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
@ -128,10 +118,10 @@ SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
@ -149,8 +139,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c

View File

@ -1,189 +1 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
DDOTKERNEL = dot.c
SDOTKERNEL = dot.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
include $(KERNELDIR)/KERNEL.ARMV8SVE

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define alphaI w19
#define alpha0_R s10
#define alphaV0_R v10.s[0]

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define alphaI w19
#define alpha0_R s10
#define alphaV0_R v10.s[0]

View File

@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pB, pB, 32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNELv1x4_M1
@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNELv1x4_M2
@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri z23.s, p1/m, z2.s, z15.s
ld1rw z15.s, p0/z, [pB, 28]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
add pB, pB, 32
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm
.macro KERNELv1x4_E
@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ii z22.s, p1/m, z3.s, z15.s
OP_ri z23.s, p1/m, z2.s, z15.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm
.macro KERNELv1x4_SUB
@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ii z22.s, p1/m, z1.s, z15.s
OP_ri z23.s, p1/m, z0.s, z15.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I
@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2w {z26.s, z27.s}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #3
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld2w {z28.s, z29.s}, p1/z, [pCRow2]
fmla z28.s, p1/m, z20.s, alphaz_R
fmls z28.s, p1/m, z21.s, alphaz_I
@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z31.s, p1/m, z23.s, alphaz_R
st2w {z30.s, z31.s}, p1, [pCRow3]
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I
@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2w {z26.s, z27.s}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #3
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I
@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, s0
dup alphaz_R, alphaR
fmov alphaI, s1
@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bne .Lcgemm_kernel_L4_Mv1_46
.Lcgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Lcgemm_kernel_L4_Mv1_END:

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
do {
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * lda * 2;
j += svcntw();
pg = svwhilelt_b32(j, n);
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg);

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
do {
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * 2;
j += svcntw();
pg = svwhilelt_b32(j, n);
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define temp x19
#define tempOffset x20
#define tempK x21
#define alphaI w19
#define temp x20
#define tempOffset x21
#define tempK x22
#define alpha0_R s10
#define alphaV0_R v10.s[0]

View File

@ -1,79 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Packs `a` into `b` one SVE vector of lda-strided lines at a time.
 *
 * For every group of up to svcntd() lines (line k of the group starts at
 * panel + k * lda), and for each of the m positions along those lines, one
 * element per active line is gathered and the gathered elements are written
 * contiguously to `b`.
 *
 * m    number of elements taken along each line
 * n    number of lines; the last group may be only partially active
 * a    source, accessed as a[i + line * lda]
 * lda  element distance between consecutive lines of `a`
 * b    destination, filled sequentially
 *
 * Always returns 0.
 *
 * NOTE(review): the casts to (double *) assume IFLOAT is double in this
 * build - confirm against the kernel configuration.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    const svint64_t line_offsets = svindex_s64(0LL, lda); /* 0, lda, 2*lda, ... */
    const uint64_t vec_len = svcntd();

    IFLOAT *panel = a;
    IFLOAT *dst = b;
    BLASLONG line = 0;

    /* Predicate selecting the lines of the current group that are < n. */
    svbool_t mask = svwhilelt_b64(line, n);
    uint64_t live = svcntp_b64(svptrue_b64(), mask);

    do {
        IFLOAT *src = panel;

        /* Same trip count as the classic `while (cnt--)` idiom on a uint64_t. */
        for (uint64_t left = m; left != 0; left--) {
            svfloat64_t gathered = svld1_gather_index(mask, (double *) src, line_offsets);
            svst1_f64(mask, (double *) dst, gathered);
            src++;
            dst += live; /* only the active lanes were stored */
        }

        /* Step to the next group of lines and rebuild the predicate. */
        panel += vec_len * lda;
        line += svcntd();
        mask = svwhilelt_b64(line, n);
        live = svcntp_b64(svptrue_b64(), mask);
    } while (svptest_any(svptrue_b64(), mask));

    return 0;
}

View File

@ -1,77 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Copy kernel for double-precision data using contiguous SVE loads.
 *
 * n is processed in groups of at most svcntd() consecutive elements. For each
 * group the routine performs m steps; every step loads one (possibly partial)
 * vector from the current position in `a`, stores it to `b`, then advances
 * the source by lda elements and the destination by the number of active
 * lanes. After a group the source base moves on by one full vector.
 *
 * m, n : iteration counts (steps per group / total lanes); nothing is copied
 *        when either is not positive.
 * a    : source, read as double.
 * lda  : distance, in elements, between consecutive steps in `a`.
 * b    : destination, written densely.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG j = 0;
    IFLOAT *aoffset = a;
    IFLOAT *aoffset1;
    IFLOAT *boffset = b;
    uint64_t sve_size = svcntd();

    /* The predicate below is built with an unsigned comparison, so a
     * non-positive n must be rejected up front rather than being read as a
     * huge count. */
    if (n <= 0) return 0;

    /* Explicit casts pick the uint64_t overload of svwhilelt_b64 regardless
     * of how BLASLONG is typedef'd, matching the other SVE kernels. */
    svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
    uint64_t active = svcntp_b64(svptrue_b64(), pg);

    do {
        aoffset1 = aoffset;

        for (BLASLONG i = 0; i < m; i++) {
            svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
            svst1_f64(pg, (double *)boffset, a_vec);
            aoffset1 += lda;
            /* Only the active lanes were written, so b stays dense. */
            boffset += active;
        }

        aoffset += sve_size;
        j += sve_size;
        pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
        active = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}

View File

@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
BLASLONG sve_width = SVE_WIDTH;
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
svbool_t pg_a = SVE_WHILELT(i, n);
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <float.h>
#include <arm_neon.h>
#if defined(SMP)
@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#else
nrm2_compute(n, x, inc_x, &ssq, &scale);
#endif
if (fabs(scale) <1.e-300) return 0.;
volatile FLOAT sca = fabs(scale);
if (sca < DBL_MIN) return 0.;
ssq = sqrt(ssq) * scale;
return ssq;

View File

@ -0,0 +1,121 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/* Element-width dependent aliases. With DOUBLE defined the 64-bit SVE vector
 * type, index type and intrinsics are selected, otherwise the 32-bit ones.
 * COUNT is the mnemonic CNAME executes through inline asm to obtain the
 * number of elements held by one vector. */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/* One copy step for complex data: gathers the values at a_offset_inner (real
 * parts) and a_offset_inner + 1 (imaginary parts) using the indices in
 * lda_vec, stores the pair to b_offset with svst2, then advances
 * a_offset_inner by 2 and b_offset by 2 * active.
 * The expansion is several statements, so it must only be used inside a
 * braced block. It refers to a_vec_real, a_vec_imag and lda_vec by name;
 * those must be declared at the expansion site. The lda parameter does not
 * appear in the expansion. */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \
a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \
svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \
a_offset_inner += 2; \
b_offset += active * 2;
/*
 * Gather-based copy kernel for complex data (pairs of IFLOAT values).
 *
 * n is consumed in groups of sve_size lanes, where sve_size is the element
 * count of one SVE vector. For each group, m INNER_COPY steps are executed:
 * a step gathers one real and one imaginary value per lane (lanes are
 * lda * 2 elements apart in `a`), stores the pairs to `b`, and moves the
 * source on by one complex element. A final partial group, if any, is
 * handled with a predicate covering only the leftover lanes.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Elements per vector, read with the width-specific count instruction. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    /* INNER_COPY refers to these three by name; the spellings are fixed. */
    SV_TYPE a_vec_real;
    SV_TYPE a_vec_imag;
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);

    IFLOAT *group_base = a;   /* start of the current lane group in a */
    IFLOAT *src;              /* walks through one group               */
    IFLOAT *dst = b;          /* running write position in b           */

    svbool_t all_lanes = SV_TRUE();

    /* Largest multiple of sve_size not above n (mask form: assumes sve_size
     * is a power of two). */
    BLASLONG full_n = n & -sve_size;

    for (BLASLONG done = 0; done < full_n; done += sve_size) {
        src = group_base;

        /* Four steps per iteration ... */
        for (uint64_t quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }
        /* ... followed by the m & 3 steps that are left. */
        for (BLASLONG rest = m & 3; rest > 0; rest--) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }

        group_base += sve_size * lda * 2;
    }

    BLASLONG tail_n = n - full_n;
    if (!tail_n) {
        return 0;
    }

    /* Partial group: only the first tail_n lanes take part. */
    svbool_t tail_lanes = SV_WHILE((uint64_t)0L, (uint64_t)tail_n);
    uint64_t tail_active = tail_n;
    src = group_base;

    for (uint64_t quads = m >> 2; quads != 0; quads--) {
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
    }
    for (BLASLONG rest = m & 3; rest > 0; rest--) {
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
    }

    return 0;
}

View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/* Element-width dependent aliases. With DOUBLE defined the 64-bit SVE vector
 * type, index type and intrinsics are selected, otherwise the 32-bit ones.
 * COUNT is the mnemonic CNAME executes through inline asm to obtain the
 * number of elements held by one vector.
 * NOTE(review): SV_PREFETCH is defined here but not referenced by the code
 * visible in this file. */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#define SV_PREFETCH svprfd_gather_index
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#define SV_PREFETCH svprfw_gather_index
#endif
/* One copy step: gathers the values at a_offset_inner using the indices in
 * lda_vec, stores them contiguously at b_offset, then advances
 * a_offset_inner by one element and b_offset by `active` elements.
 * The expansion is several statements, so it must only be used inside a
 * braced block. It refers to a_vec and lda_vec by name; both must be
 * declared at the expansion site. The lda parameter does not appear in the
 * expansion. */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \
svst1(pg, b_offset, a_vec); \
a_offset_inner++; \
b_offset += active;
/*
 * Gather-based copy kernel.
 *
 * n is consumed in groups of sve_size lanes, where sve_size is the element
 * count of one SVE vector. For each group, m INNER_COPY steps are executed:
 * a step gathers one value per lane (lanes are lda elements apart in `a`),
 * stores them contiguously to `b`, and moves the source on by one element.
 * Full groups run the step loop unrolled eight times; the final partial
 * group, if any, uses a four-way unroll and a predicate covering only the
 * leftover lanes.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Elements per vector, read with the width-specific count instruction. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    /* INNER_COPY refers to these two by name; the spellings are fixed. */
    SV_TYPE a_vec;
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda);

    IFLOAT *group_base = a;   /* start of the current lane group in a */
    IFLOAT *src;              /* walks through one group               */
    IFLOAT *dst = b;          /* running write position in b           */

    svbool_t all_lanes = SV_TRUE();

    /* Largest multiple of sve_size not above n (mask form: assumes sve_size
     * is a power of two). */
    BLASLONG full_n = n & -sve_size;

    for (BLASLONG done = 0; done < full_n; done += sve_size) {
        src = group_base;

        /* Eight steps per iteration ... */
        for (uint64_t octets = m >> 3; octets != 0; octets--) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }
        /* ... followed by the m & 7 steps that are left. */
        for (BLASLONG rest = m & 7; rest > 0; rest--) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }

        group_base += sve_size * lda;
    }

    BLASLONG tail_n = n - full_n;
    if (!tail_n) {
        return 0;
    }

    /* Partial group: only the first tail_n lanes take part. */
    svbool_t tail_lanes = SV_WHILE((uint64_t)0L, (uint64_t)tail_n);
    uint64_t tail_active = tail_n;
    src = group_base;

    for (uint64_t quads = m >> 2; quads != 0; quads--) {
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
    }
    for (BLASLONG rest = m & 3; rest > 0; rest--) {
        INNER_COPY(tail_lanes, src, dst, lda, tail_active);
    }

    return 0;
}

View File

@ -0,0 +1,115 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/* Element-width dependent aliases. With DOUBLE defined the 64-bit two-vector
 * tuple type and predicate intrinsics are selected, otherwise the 32-bit
 * ones. COUNT is the mnemonic CNAME executes through inline asm to obtain
 * the number of elements held by one vector. */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64x2_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32x2_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/* One copy step for complex data: loads a two-vector tuple from
 * a_offset_inner with svld2, stores it to b_offset with svst2, then advances
 * a_offset_inner by lda * 2 and b_offset by active * 2.
 * The expansion is several statements, so it must only be used inside a
 * braced block. It refers to a_vec by name; that variable must be declared
 * at the expansion site. */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec = svld2(pg, a_offset_inner); \
svst2(pg, b_offset, a_vec); \
a_offset_inner += lda * 2; \
b_offset += active * 2;
/*
 * Contiguous-load copy kernel for complex data (pairs of IFLOAT values).
 *
 * n is consumed in groups of sve_size complex elements, where sve_size is
 * the element count of one SVE vector. For each group, m INNER_COPY steps
 * are executed: a step loads one group of pairs from `a` with svld2, stores
 * it to `b` with svst2, and moves the source on by lda * 2 elements. A final
 * partial group, if any, is handled with a predicate covering only the
 * leftover lanes.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /* Elements per vector. Deliberately left uninitialised: the asm below is
     * the only writer, and the previous svcntw() initialiser was a dead store
     * that also named the wrong element width for DOUBLE builds. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* INNER_COPY refers to a_vec by name. */
    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /* Largest multiple of sve_size not above n (mask form: assumes sve_size
     * is a power of two). */
    BLASLONG single_vectors_n = n & -sve_size;
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;
        uint64_t i_cnt = m >> 2;
        /* Four steps per iteration, then the remaining m & 3 steps. */
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Next group starts sve_size complex elements further on. */
        a_offset += sve_size * 2;
    }

    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        /* Partial group: only the first remaining_n lanes take part. */
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}

View File

@ -0,0 +1,125 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/* Element-width dependent aliases. With DOUBLE defined the 64-bit SVE vector
 * type and predicate intrinsics are selected, otherwise the 32-bit ones.
 * COUNT is the mnemonic CNAME executes through inline asm to obtain the
 * number of elements held by one vector. */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/* One copy step: loads one vector from a_offset_inner, stores it to
 * b_offset, then advances a_offset_inner by lda elements and b_offset by
 * `active` elements.
 * The expansion is several statements, so it must only be used inside a
 * braced block. It refers to a_vec by name; that variable must be declared
 * at the expansion site. */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec = svld1(pg, a_offset_inner); \
svst1(pg, b_offset, a_vec); \
a_offset_inner += lda; \
b_offset += active;
/*
 * Contiguous-load copy kernel.
 *
 * n is consumed in groups of sve_size elements, where sve_size is the
 * element count of one SVE vector. For each group, m INNER_COPY steps are
 * executed: a step loads one vector from `a`, stores it to `b`, and moves
 * the source on by lda elements. Full groups run the step loop unrolled
 * eight times; the final partial group, if any, uses a four-way unroll and a
 * predicate covering only the leftover lanes.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /* Elements per vector. Deliberately left uninitialised: the asm below is
     * the only writer, and the previous svcntw() initialiser was a dead store
     * that also named the wrong element width for DOUBLE builds. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* INNER_COPY refers to a_vec by name. */
    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /* Largest multiple of sve_size not above n (mask form: assumes sve_size
     * is a power of two). */
    BLASLONG single_vectors_n = n & -sve_size;
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;
        uint64_t i_cnt = m >> 3;
        /* Eight steps per iteration, then the remaining m & 7 steps. */
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 4) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Next group starts one full vector further on. */
        a_offset += sve_size;
    }

    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        /* Partial group: only the first remaining_n lanes take part. */
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;
        /* Four steps per iteration here, then the remaining m & 3 steps. */
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}

0
kernel/arm64/sgemm_beta.S Executable file → Normal file
View File

View File

@ -1,78 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Gather-based copy kernel for single-precision data.
 *
 * n is processed in groups of at most svcntw() lanes. For each group the
 * routine performs m steps; every step gathers one value per active lane
 * from `a` (lanes are lda elements apart), stores them contiguously to `b`,
 * then advances the source by one element and the destination by the number
 * of active lanes. After a group the source base moves on by
 * sve_size * lda elements.
 *
 * m, n : iteration counts (steps per group / total lanes); nothing is copied
 *        when either is not positive.
 * a    : source, read as float.
 * lda  : distance, in elements, between neighbouring lanes in `a`.
 * b    : destination, written densely.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG j = 0;
    IFLOAT *aoffset = a;
    IFLOAT *aoffset1;
    IFLOAT *boffset = b;

    /* Lane i gathers from element index i * lda. The index vector is 32-bit,
     * so lda is narrowed to int32_t here, as in the original code. */
    svint32_t lda_vec = svindex_s32(0LL, lda);
    uint32_t sve_size = svcntw();

    /* The predicate below is built with an unsigned comparison, so a
     * non-positive n must be rejected up front rather than being read as a
     * huge count. */
    if (n <= 0) return 0;

    /* Explicit casts pick the uint64_t overload of svwhilelt_b32 regardless
     * of how BLASLONG is typedef'd, matching the other SVE kernels. */
    svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
    uint32_t active = svcntp_b32(svptrue_b32(), pg);

    do {
        aoffset1 = aoffset;

        /* Count steps in BLASLONG so that m is not truncated to 32 bits. */
        for (BLASLONG i = 0; i < m; i++) {
            svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
            svst1_f32(pg, (float *) boffset, a_vec);
            aoffset1++;
            /* Only the active lanes were written, so b stays dense. */
            boffset += active;
        }

        aoffset += sve_size * lda;
        j += sve_size;
        pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
        active = svcntp_b32(svptrue_b32(), pg);
    } while (svptest_any(svptrue_b32(), pg));

    return 0;
}

View File

@ -1,77 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Copy kernel for single-precision data using contiguous SVE loads.
 *
 * n is processed in groups of at most svcntw() consecutive elements. For
 * each group the routine performs m steps; every step loads one (possibly
 * partial) vector from the current position in `a`, stores it to `b`, then
 * advances the source by lda elements and the destination by the number of
 * active lanes. After a group the source base moves on by one full vector.
 *
 * m, n : iteration counts (steps per group / total lanes); nothing is copied
 *        when either is not positive.
 * a    : source, read as float.
 * lda  : distance, in elements, between consecutive steps in `a`.
 * b    : destination, written densely.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG j = 0;
    IFLOAT *aoffset = a;
    IFLOAT *aoffset1;
    IFLOAT *boffset = b;
    uint32_t sve_size = svcntw();

    /* The predicate below is built with an unsigned comparison, so a
     * non-positive n must be rejected up front rather than being read as a
     * huge count. */
    if (n <= 0) return 0;

    /* Explicit casts pick the uint64_t overload of svwhilelt_b32 regardless
     * of how BLASLONG is typedef'd, matching the other SVE kernels. */
    svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
    uint32_t active = svcntp_b32(svptrue_b32(), pg);

    do {
        aoffset1 = aoffset;

        /* Count steps in BLASLONG so that m is not truncated to 32 bits. */
        for (BLASLONG i = 0; i < m; i++) {
            svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
            svst1_f32(pg, (float *) boffset, a_vec);
            aoffset1 += lda;
            /* Only the active lanes were written, so b stays dense. */
            boffset += active;
        }

        aoffset += sve_size;
        j += sve_size;
        pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
        active = svcntp_b32(svptrue_b32(), pg);
    } while (svptest_any(svptrue_b32(), pg));

    return 0;
}

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

Some files were not shown because too many files have changed in this diff Show More