Merge branch 'xianyi:develop' into nanobench

This commit is contained in:
Christopher Sidebottom 2023-08-14 15:45:22 +01:00 committed by GitHub
commit 82827762c0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
810 changed files with 43015 additions and 5473 deletions

View File

@ -2,9 +2,166 @@ macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
# Native make build with Homebrew LLVM's clang, OpenMP enabled, TARGET=VORTEX.
task:
  name: AppleM1/LLVM
  compile_script:
    - brew install llvm
    # Prefer the Homebrew LLVM toolchain and expose its libraries/headers
    # to the linker and preprocessor.
    - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
    - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
    - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
    - make TARGET=VORTEX USE_OPENMP=1 CC=clang
# Same Homebrew LLVM/OpenMP make build as AppleM1/LLVM, with INTERFACE64=1
# added to the make invocation.
task:
  name: AppleM1/LLVM/ILP64
  compile_script:
    - brew install llvm
    - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
    - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
    - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
    - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
# CMake build (shared libraries on) with Homebrew LLVM's clang, configured
# out-of-tree in ./build.
task:
  name: AppleM1/LLVM/CMAKE
  compile_script:
    - brew install llvm
    - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
    - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
    - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
    - mkdir build
    - cd build
    - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
    - make
# Make build with Homebrew GCC 11 (gcc-11 / gfortran-11) and OpenMP.
# No TARGET is passed, so the build system picks the target itself.
task:
  name: AppleM1/GCC/MAKE/OPENMP
  compile_script:
    - brew install gcc@11
    - export PATH=/opt/homebrew/bin:$PATH
    - export LDFLAGS="-L/opt/homebrew/lib"
    - export CPPFLAGS="-I/opt/homebrew/include"
    - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
# Instance image for the x86_64 cross-build task that follows.
macos_instance:
  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest

# Builds a DYNAMIC_ARCH x86_64 library (TARGET=CORE2, no Fortran) with the
# Xcode 14.0.0 clang and the MacOSX12.3 SDK.
task:
  name: AppleM1/LLVM x86_64 xbuild
  compile_script:
    # NOTE(review): in YAML an item consisting only of "#..." is an empty
    # (null) entry, and in "export #..." everything from " #" on is a comment,
    # so the next four items amount to one empty entry plus three bare
    # `export` commands. Presumably left over from commenting out the
    # Homebrew LLVM setup - confirm and turn into real comments if so.
    - #brew install llvm
    - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
    - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
    - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
    # Architecture-selection variables, all restricted to i386/x86_64.
    - export ARCHS="i386 x86_64"
    - export ARCHS_STANDARD="i386 x86_64"
    - export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
    - export ARCHS_STANDARD_64_BIT=x86_64
    - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
    - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
    - export VALID_ARCHS="i386 x86_64"
    # Diagnostics: print the active SDK path and Xcode version to the log.
    - xcrun --sdk macosx --show-sdk-path
    - xcodebuild -version
    - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
    - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
    # RANLIB is replaced by `ls -l`, so no real ranlib runs on the archive
    # (presumably to sidestep the host ranlib during the cross build - confirm).
    - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
  # Upload files matching *conf* as text artifacts.
  always:
    config_artifacts:
      path: "*conf*"
      type: text/plain
#    lib_artifacts:
#      path: "libopenblas*"
#      type: application/octet-stream
# Instance image for the iOS cross-build task that follows.
macos_instance:
  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest

# Cross-builds for arm64 iOS (TARGET=ARMV8, no Fortran) with the Xcode 14.0.0
# clang and the iPhoneOS16.0 SDK, minimum deployment target iOS 10.0.
task:
  name: AppleM1/LLVM armv8-ios xbuild
  compile_script:
    # NOTE(review): "- #..." is an empty entry and "export #..." is a bare
    # `export` once YAML strips the comment; presumably disabled Homebrew
    # LLVM setup - confirm.
    - #brew install llvm
    - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
    - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
    - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
    - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
    - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
    # CROSS=1 presumably marks this as a cross-compile so target binaries are
    # not executed on the build host - confirm against the Makefiles.
    - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
  # Upload files matching *conf* as text artifacts.
  always:
    config_artifacts:
      path: "*conf*"
      type: text/plain
# Instance image for the Android NDK cross-build task that follows.
macos_instance:
  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest

# Cross-builds for ARMv7 Android (soft-float ABI, no Fortran) with the
# armv7a-linux-androideabi23 clang from the Homebrew android-ndk 25b cask.
task:
  name: AppleM1/LLVM armv7-androidndk xbuild
  compile_script:
    # NOTE(review): "- #..." items are empty entries and "export #..." items
    # are bare `export` commands once YAML strips the comment; presumably
    # disabled setup steps - confirm. The NDK is not installed here, so the
    # image is assumed to ship it already - TODO confirm.
    - #brew install android-ndk
    - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
    - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
    - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
    # Diagnostic: list any armv7a ranlib binaries present in the NDK.
    - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
    - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
    - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
    - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
    # RANLIB is replaced by `ls -l`, so no real ranlib runs on the archive.
    - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
  # Upload files matching *conf* as text artifacts.
  always:
    config_artifacts:
      path: "*conf*"
      type: text/plain
# Default `make` build inside an ARM container (node:latest image).
task:
  name: NeoverseN1
  arm_container:
    image: node:latest
  compile_script:
    - make
# ARM container build with INTERFACE64=1 passed to make.
task:
  name: NeoverseN1-ILP64
  arm_container:
    image: node:latest
  compile_script:
    - make INTERFACE64=1
# ARM container build with OpenMP enabled; the container requests 8 CPUs.
task:
  name: NeoverseN1-OMP
  arm_container:
    image: node:latest
    cpu: 8
  compile_script:
    - make USE_OPENMP=1
# FreeBSD 13.2 build with GNU make and the `gcc` package from pkg.
# NOTE(review): the task name says gcc12, but the version actually installed is
# whatever the `gcc` package resolves to - confirm.
FreeBSD_task:
  name: FreeBSD-gcc12
  freebsd_instance:
    image_family: freebsd-13-2
  install_script:
    - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
  compile_script:
    # Diagnostic listing of the installed libraries before building.
    - ls -l /usr/local/lib
    - gmake CC=gcc
# FreeBSD 13.2 build with GNU make and pkg's `gcc`, with INTERFACE64=1 passed
# to gmake.
FreeBSD_task:
  name: freebsd-gcc12-ilp64
  freebsd_instance:
    image_family: freebsd-13-2
  install_script:
    - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
  compile_script:
    # Diagnostic listing of the installed libraries before building.
    - ls -l /usr/local/lib
    - gmake CC=gcc INTERFACE64=1
#task:
# name: Windows/LLVM16 --- too slow ---
# windows_container:
# image: cirrusci/windowsservercore:cmake-2021.12.07
# install_script:
# - choco list --localonly
# - choco install -y llvm
# - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
# - choco install -y ninja
# - refreshenv
# - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
# - vcvarsall x64
# - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
# - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
# - cd build
# - cmake --build .
# - ctest

121
.github/workflows/c910v.yml vendored Normal file
View File

@ -0,0 +1,121 @@
# Cross-builds static OpenBLAS for two RISC-V targets (RISCV64_GENERIC, C910V)
# and runs the utest/ctest/test binaries under a qemu-user built from the
# T-head-Semi/qemu fork.
name: c910v qemu test

on: [push, pull_request]

permissions:
  contents: read  # to fetch code (actions/checkout)

jobs:
  TEST:
    runs-on: ubuntu-latest
    env:
      # Download location and archive name of the Xuantie GCC toolchain.
      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
    strategy:
      fail-fast: false
      matrix:
        include:
          - target: RISCV64_GENERIC
            triple: riscv64-linux-gnu
            apt_triple: riscv64-linux-gnu
            opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
          - target: C910V
            triple: riscv64-unknown-linux-gnu
            apt_triple: riscv64-linux-gnu
            opts: NO_SHARED=1 TARGET=C910V

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: install build deps
        # -y keeps the install non-interactive regardless of the runner's
        # apt configuration.
        run: |
          sudo apt-get update
          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross

      - name: checkout qemu
        uses: actions/checkout@v3
        with:
          repository: T-head-Semi/qemu
          path: qemu
          # Pinned commit; quoted so it is always read as a string.
          ref: '1e692ebb43d396c52352406323fc782c1ac99a42'

      - name: build qemu
        run: |
          # Force use c910v qemu-user
          wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
          cd qemu
          patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
          make -j$(nproc)
          make install

      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
          # Fall back to the newest cache of the same ref, then of any ref.
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.target }}

      - name: Configure ccache
        run: |
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      - name: build OpenBLAS
        # The Xuantie toolchain is unpacked into /opt and put first on PATH;
        # both compilers go through ccache and link statically.
        run: |
          wget ${xuetie_toolchain}/${toolchain_file_name}
          tar -xvf ${toolchain_file_name} -C /opt
          export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
          make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      - name: test
        # Runs every test binary through the freshly built qemu-riscv64.
        # The *.SUMM files are removed before each BLAT2/BLAT3 round.
        run: |
          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
          qemu-riscv64 ./utest/openblas_utest
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat

View File

@ -151,40 +151,53 @@ jobs:
strategy:
fail-fast: false
matrix:
msystem: [MINGW64, MINGW32, CLANG64]
msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
idx: [int32, int64]
build-type: [Release]
include:
- msystem: MINGW64
idx: int32
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
fc-pkg: fc
- msystem: MINGW32
idx: int32
target-prefix: mingw-w64-i686
fc-pkg: mingw-w64-i686-gcc-fortran
fc-pkg: fc
- msystem: CLANG64
idx: int32
target-prefix: mingw-w64-clang-x86_64
fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: CLANG32
idx: int32
target-prefix: mingw-w64-clang-i686
fc-pkg: cc
c-lapack-flags: -DC_LAPACK=ON
- msystem: MINGW64
idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
fc-pkg: fc
- msystem: CLANG64
idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-clang-x86_64
c-lapack-flags: -DC_LAPACK=ON
fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: MINGW64
idx: int32
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
fc-pkg: fc
build-type: None
exclude:
- msystem: MINGW32
idx: int64
- msystem: CLANG32
idx: int64
defaults:
run:
@ -209,7 +222,7 @@ jobs:
install: >-
base-devel
${{ matrix.target-prefix }}-cc
${{ matrix.fc-pkg }}
${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
${{ matrix.target-prefix }}-cmake
${{ matrix.target-prefix }}-ninja
${{ matrix.target-prefix }}-ccache
@ -217,14 +230,21 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v3
- name: Compilation cache
uses: actions/cache@v3
with:
# It looks like this path needs to be hard-coded.
path: C:/msys64/home/runneradmin/.ccache
- name: Prepare ccache
# Get cache location of ccache
# Create key that is used in action/cache/restore and action/cache/save steps
id: ccache-prepare
run: |
echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
- name: Restore ccache
uses: actions/cache/restore@v3
with:
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
key: ${{ steps.ccache-prepare.outputs.key }}
# Restore a matching ccache cache entry. Prefer same branch.
restore-keys: |
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
@ -234,9 +254,10 @@ jobs:
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
run: |
which ccache
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 250M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
ccache -p
ccache -s
echo $HOME
cygpath -w $HOME
@ -253,6 +274,7 @@ jobs:
-DTARGET=CORE2 \
${{ matrix.idx64-flags }} \
${{ matrix.c-lapack-flags }} \
${{ matrix.no-avx512-flags }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
@ -264,10 +286,30 @@ jobs:
continue-on-error: true
run: ccache -s
- name: Save ccache
# Save the cache after we are done (successfully) building
uses: actions/cache/save@v3
with:
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
key: ${{ steps.ccache-prepare.outputs.key }}
- name: Run tests
id: run-ctest
timeout-minutes: 60
run: cd build && ctest
- name: Re-run tests
if: always() && (steps.run-ctest.outcome == 'failure')
timeout-minutes: 60
run: |
cd build
echo "::group::Re-run ctest"
ctest --rerun-failed --output-on-failure || true
echo "::endgroup::"
echo "::group::Log from these tests"
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
echo "::endgroup::"
cross_build:
runs-on: ubuntu-22.04
@ -295,6 +337,7 @@ jobs:
- name: Install Dependencies
run: |
sudo apt-get update
sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
- name: Compilation cache

110
.github/workflows/loongarch64.yml vendored Normal file
View File

@ -0,0 +1,110 @@
# Cross-builds static OpenBLAS for three LoongArch64 targets and runs the
# utest/ctest/test binaries under qemu-loongarch64-static.
name: loongarch64 qemu test

on: [push, pull_request]

# Least privilege: the job only needs to read the repository contents.
permissions:
  contents: read  # to fetch code (actions/checkout)

jobs:
  TEST:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        include:
          - target: LOONGSONGENERIC
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
          - target: LOONGSON3R5
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSON3R5
          - target: LOONGSON2K1000
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSON2K1000

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Install APT deps
        # -y keeps both commands non-interactive: add-apt-repository would
        # otherwise wait for ENTER, and apt-get for confirmation.
        run: |
          sudo add-apt-repository -y ppa:savoury1/virtualisation
          sudo apt-get update
          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
          qemu-user-static

      - name: Download and install loongarch64-toolchain
        run: |
          wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
          tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt

      - name: Set env
        # Written to GITHUB_ENV so the later steps see the cross toolchain's
        # binaries and target libraries.
        run: |
          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV

      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
          # Fall back to the newest cache of the same ref, then of any ref.
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.target }}

      - name: Configure ccache
        run: |
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      - name: Disable utest dsdot:dsdot_n_1
        # Truncates utest/test_dsdot.c to an empty file so the test is not built.
        run: |
          echo -n > utest/test_dsdot.c
          echo "Due to the qemu versions 7.2 causing utest cases to fail,"
          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."

      - name: Build OpenBLAS
        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      - name: Test
        # Runs every test binary through qemu-loongarch64-static.
        # The *.SUMM files are removed before each BLAT2/BLAT3 round.
        run: |
          qemu-loongarch64-static ./utest/openblas_utest
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat

1
.gitignore vendored
View File

@ -14,6 +14,7 @@ lapack-3.4.2
lapack-3.4.2.tgz
lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/SRC/la_constants.mod
lapack-netlib/TESTING/testing_results.txt
lapack-netlib/INSTALL/test*
lapack-netlib/TESTING/xeigtstc

View File

@ -311,19 +311,25 @@ endif()
#if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
if (NOT ONLY_CBLAS)
# Broken without fortran on unix
add_subdirectory(utest)
add_subdirectory(utest)
endif()
endif()
if (NOT NOFORTRAN)
if (NOT ONLY_CBLAS)
# Build test and ctest
add_subdirectory(test)
endif()
if (BUILD_TESTING)
add_subdirectory(lapack-netlib/TESTING)
endif()
endif()
if(NOT NO_CBLAS)
if (NOT ONLY_CBLAS)
add_subdirectory(ctest)
endif()
endif()
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
add_subdirectory(cpp_thread_test)
@ -432,7 +438,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
endif()
if (NOT USE_PERL)
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
@ -543,9 +549,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(PN OpenBLAS)
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
configure_package_config_file(cmake/${PN}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})

15
Jenkinsfile vendored
View File

@ -1,9 +1,14 @@
node {
stage('Checkout') {
checkout
pipeline {
agent {
docker {
image 'osuosl/ubuntu-s390x'
}
}
stages {
stage('Build') {
sh("make")
steps {
sh 'make clean && make'
}
}
}
}

16
Jenkinsfile.pwr Normal file
View File

@ -0,0 +1,16 @@
// Declarative pipeline: installs gfortran and builds OpenBLAS from a clean
// tree inside the osuosl/ubuntu-ppc64le Docker image.
pipeline {
    agent {
        docker {
            image 'osuosl/ubuntu-ppc64le'
        }
    }
    stages {
        stage('Build') {
            steps {
                // apt-get instead of apt: apt warns that its CLI is not
                // stable for use in scripts.
                sh 'sudo apt-get update'
                sh 'sudo apt-get install -y gfortran'
                // Clean first so nothing from an earlier build is reused.
                sh 'make clean && make'
            }
        }
    }
}

View File

@ -373,10 +373,10 @@ ifneq ($(CROSS), 1)
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
endif
lapack-runtest:
lapack-runtest: lapack-test
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
blas-test:

View File

@ -69,7 +69,7 @@ endif
# in GCC>=9
ifeq ($(CORE), NEOVERSEN1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
@ -92,9 +92,14 @@ endif
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEV1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.4-a+sve
ifeq (1, $(ISCLANG))
CCOMMON_OPT += -mtune=cortex-x1
else
CCOMMON_OPT += -mtune=neoverse-v1
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif
@ -122,8 +127,8 @@ endif
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
else
@ -155,7 +160,7 @@ endif
# Use a53 tunings because a55 is only available in GCC>=8.1
ifeq ($(CORE), CORTEXA55)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ8), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
@ -196,8 +201,13 @@ endif
endif
ifeq ($(CORE), THUNDERX3T110)
ifeq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
CCOMMON_OPT += -march=armv8.3-a
ifeq (0, $(ISCLANG))
CCOMMON_OPT += -mtune=thunderx3t110
else
CCOMMON_OPT += -mtune=thunderx2t99
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
endif
@ -225,9 +235,12 @@ endif
endif
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
ifeq ($(CORE), EMAG8180)
CCOMMON_OPT += -march=armv8-a -mtune=emag
CCOMMON_OPT += -march=armv8-a
ifeq ($(ISCLANG), 0)
CCOMMON_OPT += -mtune=emag
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=emag
endif

View File

@ -645,7 +645,7 @@ DYNAMIC_CORE += HASWELL ZEN
endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
endif
endif
endif
@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1
ifneq ($(NO_SVE), 1)
DYNAMIC_CORE += NEOVERSEV1
DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += ARMV8SVE
endif
DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR
@ -932,8 +933,12 @@ BINARY_DEFINED = 1
endif
ifeq ($(ARCH), loongarch64)
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
ifneq ($(LA64_ABI), lp64d)
LA64_ABI=lp64
endif
CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
endif
endif
@ -1082,8 +1087,9 @@ endif
endif
endif
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
CCOMMON_OPT += -DF_INTERFACE_GFORT
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
@ -1097,6 +1103,7 @@ EXTRALIB += -lgfortran
endif
endif
endif
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
@ -1763,6 +1770,8 @@ export TARGET_CORE
export NO_AVX512
export NO_AVX2
export BUILD_BFLOAT16
export NO_LSX
export NO_LASX
export SBGEMM_UNROLL_M
export SBGEMM_UNROLL_N

View File

@ -9,9 +9,12 @@ AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n
Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
<!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
## Introduction
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.

View File

@ -115,7 +115,7 @@ jobs:
mkdir build
cd build
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake --build . --config Release
ctest
@ -271,6 +271,19 @@ jobs:
- script: |
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
# Cross-builds a DYNAMIC_ARCH ARMV8 library for arm64 (no Fortran) on the
# macOS-11 image with the clang and MacOSX11.3 SDK from Xcode 12.5.1.
- job: OSX_xbuild_DYNAMIC_ARM64
  pool:
    vmImage: 'macOS-11'
  variables:
    CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
    CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
  steps:
  - script: |
      # Diagnostics: available SDKs, CPUs the compiler supports for arm64,
      # and the version of the same clang that CC points at.
      ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
      /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
      /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
      make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: ALPINE_MUSL
pool:
vmImage: 'ubuntu-latest'

0
benchmark/spr.c Executable file → Normal file
View File

0
benchmark/spr2.c Executable file → Normal file
View File

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
Copyright (c) 2014, 2023 The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
int step = 1;
int loops = 1;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
double time1,timeg;
@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){

52
c_check
View File

@ -31,11 +31,12 @@ flags="$*"
cross_suffix=""
if [ "`dirname \"$compiler_name\"`" != '.' ]; then
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
if [ "`dirname "$compiler_name"`" != '.' ]; then
cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
fi
bn=`basename \"$compiler_name\"`
cn=`echo $compiler_name | sed -e 's/ -.*//'`
bn=`basename "$cn"`
case "$bn" in
*-*) if [ "$bn" != '-' ]; then
@ -167,7 +168,7 @@ fi
no_msa=0
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
tmpd="$(mktemp -d)"
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
code='"addvi.b $w0, $w1, 1"'
msa_flags='-mmsa -mfp64 -mload-store-pairs'
@ -184,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
rm -rf "$tmpd"
fi
# Probe whether the compiler can build LoongArch LSX and LASX code.
# Each probe writes a one-instruction C program into a scratch directory and
# tries to build it with the matching -mlsx / -mlasx flag; no_lsx / no_lasx
# are set to 1 when that build fails and stay 0 otherwise.
no_lsx=0
no_lasx=0
if [ "$architecture" = "loongarch64" ]; then
    # Same temp-dir creation (with the -t fallback) as the other feature probes.
    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')

    # LSX probe
    tmplsx="$tmpd/lsx.c"
    codelsx='"vadd.b $vr0, $vr0, $vr0"'
    lsx_flags='-march=loongarch64 -mlsx'
    printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
    args="$lsx_flags -o $tmplsx.o $tmplsx"
    {
        $compiler_name $flags $args >/dev/null 2>&1
    } || {
        no_lsx=1
    }

    # LASX probe
    tmplasx="$tmpd/lasx.c"
    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
    lasx_flags='-march=loongarch64 -mlasx'
    printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
    printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
    args="$lasx_flags -o $tmplasx.o $tmplasx"
    {
        $compiler_name $flags $args >/dev/null 2>&1
    } || {
        no_lasx=1
    }

    # Sources and build outputs all live under $tmpd, so one removal cleans up.
    rm -rf "$tmpd"
fi
case "$data" in
*ARCH_X86_64*) architecture=x86_64 ;;
*ARCH_X86*) architecture=x86 ;;
@ -207,7 +239,7 @@ esac
no_avx512=0
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@ -228,7 +260,7 @@ fi
no_rv64gv=0
if [ "$architecture" = "riscv64" ]; then
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
code='"vsetvli zero, zero, e8, m1\n"'
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@ -244,7 +276,7 @@ fi
no_sve=0
if [ "$architecture" = "arm64" ]; then
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
@ -260,7 +292,7 @@ fi
c11_atomics=0
case "$data" in
*HAVE_C11*)
tmpd=`mktemp -d`
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
args=" -c -o $tmpf.o $tmpf"
@ -398,6 +430,8 @@ done
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
} >> "$makefile"
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
@ -413,6 +447,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
} >> "$config"

View File

@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
}
}
# Probe whether the compiler can build LoongArch LSX and LASX code.
# Each probe writes a one-instruction C program to a temporary file and runs
# the compiler on it with -mlsx / -mlasx.  $no_lsx / $no_lasx become 1 when the
# compiler exits with a non-zero status and 0 otherwise.  If File::Temp cannot
# be loaded, both probes are skipped and the flags stay 0.
$no_lsx = 0;
$no_lasx = 0;
if (($architecture eq "loongarch64")) {
    eval "use File::Temp qw(tempfile)";
    if ($@){
        warn "could not load PERL module File::Temp, so could not check LSX and LASX capability";
    } else {
        my @cmd;

        # LSX probe
        $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
        $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
        $lsx_flags = "-march=loongarch64 -mlsx";
        print $tmplsx "#include <lsxintrin.h>\n\n";
        print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
        $args = "$lsx_flags -o $tmplsx.o $tmplsx";
        @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
        system(@cmd);
        # $? holds the exit status of the compiler invocation above.
        if ($? != 0) {
            $no_lsx = 1;
        } else {
            $no_lsx = 0;
        }
        unlink("$tmplsx.o");

        # LASX probe
        $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
        $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
        $lasx_flags = "-march=loongarch64 -mlasx";
        print $tmplasx "#include <lasxintrin.h>\n\n";
        print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
        $args = "$lasx_flags -o $tmplasx.o $tmplasx";
        @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
        system(@cmd);
        if ($? != 0) {
            $no_lasx = 1;
        } else {
            $no_lasx = 0;
        }
        unlink("$tmplasx.o");
    }
}
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
if ($os eq "LINUX") {

View File

@ -350,7 +350,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
/*** BLAS extensions ***/

View File

@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
endif ()
if (DYNAMIC_LIST)
@ -135,7 +135,7 @@ if (ARM64)
set(BINARY_DEFINED 1)
endif ()
if (${ARCH} STREQUAL "riscv64")
if (RISCV64)
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()

View File

@ -65,6 +65,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
endif ()
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
if (POWER)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
endif ()
endif ()
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")

View File

@ -3,7 +3,8 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (${F_COMPILER} STREQUAL "FLANG")
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN")
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "${EXTRALIB} -lgfortran")
if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
if (NOT NO_LAPACK)
# Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
set(EXTRALIB "${EXTRALIB} -lgfortran")
endif ()
endif ()
if (NO_BINARY_MODE)
if (MIPS64)
@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
endif ()
endif ()
if (RISCV64)
if (BINARY64)
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
endif ()
endif ()
else ()
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
@ -121,7 +131,7 @@ if (${F_COMPILER} STREQUAL "IBM")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "PGI")
if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
if (BINARY64)

View File

@ -124,7 +124,7 @@ set(SLASRC
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
sgesvdq.f slaorhr_col_getrfnp.f
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
slatrs3.f strsyl3.f sgelst.f)
slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90)
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@ -187,7 +187,7 @@ set(CLASRC
cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
crot.f cspcon.f csprfs.f cspsv.f
crot.f crscl.f cspcon.f csprfs.f cspsv.f
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
cstegr.f cstein.f csteqr.f csycon.f
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f
@ -223,7 +223,7 @@ set(CLASRC
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
cungtsqr.f cungtsqr_row.f cunhr_col.f
clatrs3.f ctrsyl3.f cgelst.f)
clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90)
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@ -316,7 +316,7 @@ set(DLASRC
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
dlatrs3.f dtrsyl3.f dgelst.f)
dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90)
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@ -381,7 +381,7 @@ set(ZLASRC
zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
zrot.f zspcon.f zsprfs.f zspsv.f
zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
zstegr.f zstein.f zsteqr.f zsycon.f
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f
@ -419,7 +419,7 @@ set(ZLASRC
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
zungtsqr.f zungtsqr_row.f zunhr_col.f
zlatrs3.f ztrsyl3.f zgelst.f)
zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90)
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
@ -624,7 +624,7 @@ set(SLASRC
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
sgesvdq.c slaorhr_col_getrfnp.c
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
slatrs3.c strsyl3.c sgelst.c)
slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c)
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
@ -686,7 +686,7 @@ set(CLASRC
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
crot.c cspcon.c csprfs.c cspsv.c
crot.c crscl.c cspcon.c csprfs.c cspsv.c
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
cstegr.c cstein.c csteqr.c csycon.c
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
@ -722,7 +722,7 @@ set(CLASRC
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
cungtsqr.c cungtsqr_row.c cunhr_col.c
clatrs3.c ctrsyl3.c cgelst.c)
clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c)
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
@ -814,7 +814,7 @@ set(DLASRC
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
dlatrs3.c dtrsyl3.c dgelst.c)
dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c)
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
@ -878,7 +878,7 @@ set(ZLASRC
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
zrot.c zspcon.c zsprfs.c zspsv.c
zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
zstegr.c zstein.c zsteqr.c zsycon.c
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
@ -915,7 +915,8 @@ set(ZLASRC
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c
zgedmd.c zgedmdq.c)
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c

View File

@ -90,6 +90,10 @@ set(CSRC
lapacke_cgerqf_work.c
lapacke_cgesdd.c
lapacke_cgesdd_work.c
lapacke_cgedmd.c
lapacke_cgedmd_work.c
lapacke_cgedmdq.c
lapacke_cgedmdq_work.c
lapacke_cgesv.c
lapacke_cgesv_work.c
lapacke_cgesvd.c
@ -558,6 +562,8 @@ set(CSRC
lapacke_ctrsna_work.c
lapacke_ctrsyl.c
lapacke_ctrsyl_work.c
lapacke_ctrsyl3.c
lapacke_ctrsyl3_work.c
lapacke_ctrtri.c
lapacke_ctrtri_work.c
lapacke_ctrtrs.c
@ -590,6 +596,8 @@ set(CSRC
lapacke_cungtr_work.c
lapacke_cungtsqr_row.c
lapacke_cungtsqr_row_work.c
lapacke_cunhr_col.c
lapacke_cunhr_col_work.c
lapacke_cunmbr.c
lapacke_cunmbr_work.c
lapacke_cunmhr.c
@ -709,6 +717,10 @@ set(DSRC
lapacke_dgerqf_work.c
lapacke_dgesdd.c
lapacke_dgesdd_work.c
lapacke_dgedmd.c
lapacke_dgedmd_work.c
lapacke_dgedmdq.c
lapacke_dgedmdq_work.c
lapacke_dgesv.c
lapacke_dgesv_work.c
lapacke_dgesvd.c
@ -862,6 +874,8 @@ set(DSRC
lapacke_dorgtr_work.c
lapacke_dorgtsqr_row.c
lapacke_dorgtsqr_row_work.c
lapacke_dorhr_col.c
lapacke_dorhr_col_work.c
lapacke_dormbr.c
lapacke_dormbr_work.c
lapacke_dormhr.c
@ -1174,6 +1188,8 @@ set(DSRC
lapacke_dtrsna_work.c
lapacke_dtrsyl.c
lapacke_dtrsyl_work.c
lapacke_dtrsyl3.c
lapacke_dtrsyl3_work.c
lapacke_dtrtri.c
lapacke_dtrtri_work.c
lapacke_dtrtrs.c
@ -1283,6 +1299,10 @@ set(SSRC
lapacke_sgerqf_work.c
lapacke_sgesdd.c
lapacke_sgesdd_work.c
lapacke_sgedmd.c
lapacke_sgedmd_work.c
lapacke_sgedmdq.c
lapacke_sgedmdq_work.c
lapacke_sgesv.c
lapacke_sgesv_work.c
lapacke_sgesvd.c
@ -1435,6 +1455,8 @@ set(SSRC
lapacke_sorgtr_work.c
lapacke_sorgtsqr_row.c
lapacke_sorgtsqr_row_work.c
lapacke_sorhr_col.c
lapacke_sorhr_col_work.c
lapacke_sormbr.c
lapacke_sormbr_work.c
lapacke_sormhr.c
@ -1744,6 +1766,8 @@ set(SSRC
lapacke_strsna_work.c
lapacke_strsyl.c
lapacke_strsyl_work.c
lapacke_strsyl3.c
lapacke_strsyl3_work.c
lapacke_strtri.c
lapacke_strtri_work.c
lapacke_strtrs.c
@ -1851,6 +1875,10 @@ set(ZSRC
lapacke_zgerqf_work.c
lapacke_zgesdd.c
lapacke_zgesdd_work.c
lapacke_zgedmd.c
lapacke_zgedmd_work.c
lapacke_zgedmdq.c
lapacke_zgedmdq_work.c
lapacke_zgesv.c
lapacke_zgesv_work.c
lapacke_zgesvd.c
@ -2319,6 +2347,8 @@ set(ZSRC
lapacke_ztrsna_work.c
lapacke_ztrsyl.c
lapacke_ztrsyl_work.c
lapacke_ztrsyl3.c
lapacke_ztrsyl3_work.c
lapacke_ztrtri.c
lapacke_ztrtri_work.c
lapacke_ztrtrs.c
@ -2351,6 +2381,8 @@ set(ZSRC
lapacke_zungtr_work.c
lapacke_zungtsqr_row.c
lapacke_zungtsqr_row_work.c
lapacke_zunhr_col.c
lapacke_zunhr_col_work.c
lapacke_zunmbr.c
lapacke_zunmbr_work.c
lapacke_zunmhr.c

View File

@ -55,7 +55,7 @@ if (DEFINED TARGET)
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
@ -280,7 +280,29 @@ if (DEFINED TARGET)
if (${TARGET} STREQUAL POWER8)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
endif()
if (${TARGET} STREQUAL NEOVERSEV1)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
else ()
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
endif()
endif()
# NEOVERSEN2: the SVE2/BF16 -march/-mtune flags are only added when the
# compiler reports version 10.4 or newer via -dumpversion; otherwise
# configuration stops with an error naming the compiler and its version.
if (${TARGET} STREQUAL NEOVERSEN2)
  execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
  if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
  else ()
    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
  endif()
endif()
if (${TARGET} STREQUAL ARMV8SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
endif()
endif()
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()

View File

@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
set(LOONGARCH64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
set(RISCV64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1)
else()
@ -107,7 +109,7 @@ else()
endif ()
if (NOT BINARY)
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
set(BINARY 64)
else ()
set(BINARY 32)

View File

@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
#message(STATUS "skipping ${makefile_line}")
continue ()
endif ()
# Example 1: SBGEMM_SMALL_M_PERMIT =
# Unset the variable
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
set(var_name ${CMAKE_MATCH_1})
unset(${var_name})
endif()
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on ${line_match}")

View File

@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#ifndef NO_AFFINITY
/* Return the value that the `rdtimel.w` instruction writes into its second
 * operand (`id`).  The first operand (`counter`) is written as well but is
 * discarded.
 * NOTE(review): presumably `id` identifies the core the caller is running on,
 * which is why this is only built when affinity support is enabled -- confirm
 * against the LoongArch reference manual. */
static inline int WhereAmI(void){
int ret = 0, counter = 0;
__asm__ volatile (
"rdtimel.w %[counter], %[id]"
/* both operands are outputs; the asm takes no inputs */
: [id]"=r"(ret), [counter]"=r"(counter)
:
: "memory"
);
return ret;
}
#endif
#ifdef DOUBLE
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
#else

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -45,12 +46,14 @@
typedef struct {
int dtb_entries;
int switch_ratio;
int offsetA, offsetB, align;
#if BUILD_BFLOAT16 == 1
int sbgemm_p, sbgemm_q, sbgemm_r;
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
int sbgemm_align_k;
int need_amxtile_permission; // 0 default, 1 for device support amx.
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);

View File

@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
/* Global Parameter */
extern int blas_cpu_number;
extern int blas_num_threads;
extern int blas_num_threads_set;
extern int blas_omp_linked;
#define BLAS_LEGACY 0x8000U
@ -136,15 +135,13 @@ typedef struct blas_queue {
#ifdef SMP_SERVER
extern int blas_server_avail;
extern int blas_omp_number_max;
static __inline int num_cpu_avail(int level) {
#ifdef USE_OPENMP
int openmp_nthreads;
if (blas_num_threads_set == 0)
openmp_nthreads=omp_get_max_threads();
else
openmp_nthreads=blas_cpu_number;
#endif
#ifndef USE_OPENMP
@ -156,7 +153,13 @@ int openmp_nthreads;
) return 1;
#ifdef USE_OPENMP
if (blas_cpu_number != openmp_nthreads) {
if (openmp_nthreads > blas_omp_number_max){
#ifdef DEBUG
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
#endif
openmp_nthreads = blas_omp_number_max;
}
if (blas_cpu_number != openmp_nthreads) {
goto_set_num_threads(openmp_nthreads);
}
#endif

View File

@ -267,9 +267,9 @@ int detect(void)
}
#else
#ifdef __APPLE__
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; //A12/M1
if (value == 3660830781) return CPU_VORTEX; //A15/M2
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
#endif
return CPU_ARMV8;
#endif

View File

@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdint.h>
#include <sys/auxv.h>
/* If LASX extension instructions supported,
* using core LOONGSON3R5
@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_LOONGSON3R5 1
#define CPU_LOONGSON2K1000 2
#define LOONGARCH_CFG2 0x02
#define LOONGARCH_LASX 1<<7
#define LOONGARCH_LSX 1<<6
#define LA_HWCAP_LSX (1<<4)
#define LA_HWCAP_LASX (1<<5)
static char *cpuname[] = {
"LOONGSONGENERIC",
@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
int detect(void) {
#ifdef __linux
uint32_t reg = 0;
int flag = (int)getauxval(AT_HWCAP);
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG2)
);
if (reg & LOONGARCH_LASX)
if (flag & LA_HWCAP_LASX)
return CPU_LOONGSON3R5;
else if (reg & LOONGARCH_LSX)
else if (flag & LA_HWCAP_LSX)
return CPU_LOONGSON2K1000;
else
return CPU_GENERIC;

View File

@ -1479,6 +1479,8 @@ int get_cpuname(void){
else
return CPUTYPE_NEHALEM;
case 15: // Sapphire Rapids
if(support_amx_bf16())
return CPUTYPE_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
if(support_avx512())
@ -1549,6 +1551,7 @@ int get_cpuname(void){
case 7: // Raptor Lake
case 10:
case 15:
case 14: // Alder Lake N
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
@ -1845,7 +1848,8 @@ static char *cpuname[] = {
"ZEN",
"SKYLAKEX",
"DHYANA",
"COOPERLAKE"
"COOPERLAKE",
"SAPPHIRERAPIDS",
};
static char *lowercpuname[] = {
@ -1902,7 +1906,8 @@ static char *lowercpuname[] = {
"zen",
"skylakex",
"dhyana",
"cooperlake"
"cooperlake",
"sapphirerapids",
};
static char *corename[] = {
@ -2356,6 +2361,7 @@ int get_coretype(void){
case 7: // Raptor Lake
case 10:
case 15:
case 14: // Alder Lake N
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;

View File

@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
CEXTRALIB += -lomp
endif
endif
ifeq ($(F_COMPILER), NAG)

270
docs/distributing.md Normal file
View File

@ -0,0 +1,270 @@
# Guidance for redistributing OpenBLAS
*We note that this document contains recommendations only - packagers and other
redistributors are in charge of how OpenBLAS is built and distributed in their
systems, and may have good reasons to deviate from the guidance given on this
page. These recommendations are aimed at general packaging systems, with a user
base that typically is large, open source (or freely available at least), and
neither behaves uniformly nor is directly connected with the packager.*
OpenBLAS has a large number of build-time options which can be used to change
how it behaves at runtime, how artifacts or symbols are named, etc. Variation
in build configuration can be necessary to achieve a given end goal within a
distribution or as an end user. However, such variation can also make it more
difficult to build on top of OpenBLAS and ship code or other packages in a way
that works across many different distros. Here we provide guidance about the
most important build options, what effects they may have when changed, and
which ones to default to.
The Make and CMake build systems provide equivalent options and yield more or
less the same artifacts, but not exactly (the CMake builds are still
experimental). You can choose either one and the options will function in the
same way, however the CMake outputs may require some renaming. To review
available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
the repository.
Build options typically fall into two categories: (a) options that affect the
user interface, such as library and symbol names or APIs that are made
available, and (b) options that affect performance and runtime behavior, such
as threading behavior or CPU architecture-specific code paths. The user
interface options are more important to keep aligned between distributions,
while for the performance-related options there are typically more reasons to
make choices that deviate from the defaults.
Here are recommendations for user interface related packaging choices where it
is not likely to be a good idea to deviate (typically these are the default
settings):
1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
binary size much, so don't turn it off.
2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
while it does make up a significant part of the binary size of the installed
library, that does not outweigh the regression in usability when deviating
from the default here.[^1]
3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
detection files. These files are used by build systems when users want to
link against OpenBLAS, and there is no benefit of leaving them out.
4. Provide the LP64 interface by default, and if in addition to that you choose
to provide an ILP64 interface build as well, use a symbol suffix to avoid
symbol name clashes (see the next section).
[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
know. Older versions of Arch Linux did not, and that was known to cause
problems.
## ILP64 interface builds
The LP64 (32-bit integer) interface is the default build, and has
well-established C and Fortran APIs as determined by the reference (Netlib)
BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
not have a standard API: symbol names and shared/static library names can be
produced in multiple ways, and this tends to make it difficult to use.
As of today there is an agreed-upon way of choosing names for OpenBLAS between
a number of key users/redistributors, which is the closest thing to a standard
that there is now. However, there is an ongoing standardization effort in the
reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
agreed-upon convention. In this section we'll aim to explain both.
Those two methods are fairly similar, and have a key thing in common: *using a
symbol suffix*. This is good practice; if you distribute an ILP64 build, it is
recommended to have it use a symbol suffix containing `64` in the name.
This avoids potential symbol clashes when different packages which depend on
OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
### The current OpenBLAS agreed-upon ILP64 convention
This convention comprises the shared library name and the symbol suffix in the
shared library. The symbol suffix to use is `64_`, implying that the library
name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
The central issue where this was discussed is
[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
To build shared and static libraries with the currently recommended ILP64
conventions with Make:
```bash
$ make INTERFACE64=1 SYMBOLSUFFIX=64_
```
This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
named `openblas64.pc`, and CMake and header files.
Installing locally and inspecting the output will show a few more details:
```bash
$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
$ tree . # output slightly edited down
.
├── include
│   ├── cblas.h
│   ├── f77blas.h
│   ├── lapacke_config.h
│   ├── lapacke.h
│   ├── lapacke_mangling.h
│   ├── lapacke_utils.h
│   ├── lapack.h
│   └── openblas_config.h
└── lib
├── cmake
│   └── openblas
│   ├── OpenBLASConfig.cmake
│   └── OpenBLASConfigVersion.cmake
├── libopenblas64_.a
├── libopenblas64_.so
└── pkgconfig
└── openblas64.pc
```
The symbol names are a key point. These will equal the LP64 symbol names, then
(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
Hence to obtain the final symbol names, we need to take into account which
Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
Fortran, or Flang), that means appending a single underscore. In that case, the
result is:
| base API name | binary symbol name | call from Fortran code | call from C code |
|---------------|--------------------|------------------------|-----------------------|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` |
It is quite useful to have these symbol names be as uniform as possible across
different packaging systems.
The equivalent build options with CMake are:
```bash
$ mkdir build && cd build
$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
$ cmake --build . -j
```
Note that the result is not 100% identical to the Make result. For example, the
library name ends in `_64` rather than `64_` - it is recommended to rename them
to match the Make library names (also update the `libsuffix` entry in
`openblas64.pc` to match that rename).
```bash
$ cmake --install . --prefix $PWD/../../openblas/cmake64
$ tree .
.
├── include
│   └── openblas64
│   ├── cblas.h
│   ├── f77blas.h
│   ├── lapacke_config.h
│   ├── lapacke_example_aux.h
│   ├── lapacke.h
│   ├── lapacke_mangling.h
│   ├── lapacke_utils.h
│   ├── lapack.h
│   ├── openblas64
│   │   └── lapacke_mangling.h
│   └── openblas_config.h
└── lib
├── cmake
│   └── OpenBLAS64
│   ├── OpenBLAS64Config.cmake
│   ├── OpenBLAS64ConfigVersion.cmake
│   ├── OpenBLAS64Targets.cmake
│   └── OpenBLAS64Targets-noconfig.cmake
├── libopenblas_64.a
├── libopenblas_64.so -> libopenblas_64.so.0
└── pkgconfig
└── openblas64.pc
```
### The upcoming standardized ILP64 convention
While the `64_` convention above got some adoption, it's slightly hacky and is
implemented through the use of `objcopy`. An effort is ongoing for a more
broadly adopted convention in the reference BLAS and LAPACK libraries, using
(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
Fortran compiler mangling. The central issue for this is
[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
For the most common cases of compiler mangling (a single `_` appended), the end
result will be:
| base API name | binary symbol name | call from Fortran code | call from C code |
|---------------|--------------------|------------------------|-----------------------|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` |
For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
The shared library name for this `_64` convention should be `libopenblas_64.so`.
Note: it is not yet possible to produce an OpenBLAS build which employs this
convention! Once reference BLAS and LAPACK with support for `_64` have been
released, a future OpenBLAS release will support it. For now, please use the
older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
considered reserved for future use of the `_64` standard as prescribed by
reference BLAS/LAPACK.
## Performance and runtime behavior related build options
For these options there are multiple reasonable or common choices.
### Threading related options
OpenBLAS can be built as a multi-threaded or single-threaded library, with the
default being multi-threaded. It's expected that the default `libopenblas`
library is multi-threaded; if you'd like to also distribute single-threaded
builds, consider naming them `libopenblas_sequential`.
OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
default being pthreads. Both options are commonly used, and the choice here
should not influence the shared library name. The choice will be captured by
the `.pc` file. E.g.:
```bash
$ pkg-config --libs openblas
-fopenmp -lopenblas
$ cat openblas.pc
...
openblas_config= ... USE_OPENMP=1 MAX_THREADS=24
```
The maximum number of threads users will be able to use is determined at build
time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
range of values that are reasonable to use (up to 256). 64 is a typical choice
here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
Please see `Makefile.rule` for more details.
### CPU architecture related options
OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
distributing to a user base with a variety of hardware, it is recommended to
enable CPU architecture runtime detection. This will dynamically select
optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
build option. This is usually done on all common CPU families, except when
there are known issues.
In case the CPU architecture is known (e.g. you're building binaries for macOS
M1 users), it is possible to specify the target architecture directly with the
`TARGET=` build option.
`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
in this repository.
## Real-world examples
OpenBLAS is likely to be distributed in one of these distribution models:
1. As a standalone package, or multiple packages, in a packaging ecosystem like
a Linux distro, Homebrew, conda-forge or MSYS2.
2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
3. Locally, e.g. made available as a build on a single HPC cluster.
The guidance on this page is most important for models (1) and (2). These links
to build recipes for a representative selection of packaging systems may be
helpful as a reference:
- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
BLASLONG divN, divT;
int mode;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
if (range_m) {
BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
BLASLONG m_to = *(((BLASLONG *)range_m) + 1);
@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
}
*/
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}
@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
divT = nthreads;
divN = 1;
while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
do {
divT --;
divN = 1;

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
int mode, mask;
double dnum, di, dinum;
if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0;
}

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -44,10 +45,6 @@
#define DIVIDE_RATE 2
#endif
#ifndef SWITCH_RATIO
#define SWITCH_RATIO 2
#endif
#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
@ -577,6 +574,11 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
BLASLONG width, i, j, k, js;
BLASLONG m, n, n_from, n_to;
int mode;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
/* Get execution mode */
#ifndef COMPLEX
@ -698,8 +700,8 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
num_parts = 0;
while (n > 0){
width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
if (width < SWITCH_RATIO) {
width = SWITCH_RATIO;
if (width < switch_ratio) {
width = switch_ratio;
}
width = round_up(n, width, GEMM_PREFERED_SIZE);
@ -746,6 +748,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
BLASLONG m = args -> m;
BLASLONG n = args -> n;
BLASLONG nthreads_m, nthreads_n;
#if defined(DYNAMIC_ARCH)
int switch_ratio = gotoblas->switch_ratio;
#else
int switch_ratio = SWITCH_RATIO;
#endif
/* Get dimensions from index ranges if available */
if (range_m) {
@ -755,21 +762,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
n = range_n[1] - range_n[0];
}
/* Partitions in m should have at least SWITCH_RATIO rows */
if (m < 2 * SWITCH_RATIO) {
/* Partitions in m should have at least switch_ratio rows */
if (m < 2 * switch_ratio) {
nthreads_m = 1;
} else {
nthreads_m = args -> nthreads;
while (m < nthreads_m * SWITCH_RATIO) {
while (m < nthreads_m * switch_ratio) {
nthreads_m = nthreads_m / 2;
}
}
/* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
if (n < SWITCH_RATIO * nthreads_m) {
/* Partitions in n should have at most switch_ratio * nthreads_m columns */
if (n < switch_ratio * nthreads_m) {
nthreads_n = 1;
} else {
nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
if (nthreads_m * nthreads_n > args -> nthreads) {
nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
}

View File

@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) {
increased_threads = 1;
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
thread_status[i].status = THREAD_STATUS_WAKEUP;

View File

@ -68,6 +68,7 @@
#endif
int blas_server_avail = 0;
int blas_omp_number_max = 0;
extern int openblas_omp_adaptive_env();
@ -100,8 +101,6 @@ static void adjust_thread_buffers() {
void goto_set_num_threads(int num_threads) {
blas_num_threads_set = 1;
if (num_threads < 0) blas_num_threads_set = 0;
if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) {
}
int blas_thread_init(void){
if(blas_omp_number_max <= 0)
blas_omp_number_max = omp_get_max_threads();
blas_get_cpu_number();

View File

@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads)
blas_server_avail = 1;
}
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,

View File

@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE;
#else
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
#endif
/* Resolve gotoblas_SAPPHIRERAPIDS: use the dedicated kernel table when
 * DYN_SAPPHIRERAPIDS is defined; otherwise alias it to the first table
 * that is available, tried in the order SKYLAKEX, HASWELL, SANDYBRIDGE,
 * NEHALEM, with PRESCOTT as the last resort. */
#ifdef DYN_SAPPHIRERAPIDS
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
#elif defined(DYN_SKYLAKEX)
#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX
#elif defined(DYN_HASWELL)
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
#else
#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
extern gotoblas_t gotoblas_COOPERLAKE;
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
#endif
#endif
#else
@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
@ -378,6 +394,31 @@ int support_avx512_bf16(){
#endif
}
#define BIT_AMX_TILE 0x01000000
#define BIT_AMX_BF16 0x00400000
#define BIT_AMX_ENBD 0x00060000
/*
 * Report whether AMX tiles with BF16 support can be used.
 *
 * Returns 1 only when support_avx512() succeeds, CPUID leaf 7 subleaf 0
 * sets both the AMX-TILE and AMX-BF16 bits in EDX, and CPUID leaf 0xD
 * subleaf 0 sets both bits of BIT_AMX_ENBD in EAX. Returns 0 otherwise,
 * and unconditionally when built with NO_AVX or NO_AVX512.
 */
int support_amx_bf16() {
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx, edx;

  if (!support_avx512())
    return 0;

  // CPUID.7.0:EDX indicates AMX support
  cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
  if (!(edx & BIT_AMX_TILE) || !(edx & BIT_AMX_BF16))
    return 0;

  // CPUID.D.0:EAX[17:18] indicates AMX enabled
  cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
  return ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) ? 1 : 0;
#else
  return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){
}
}
if (model == 15){ // Sapphire Rapids
if(support_amx_bf16())
return &gotoblas_SAPPHIRERAPIDS;
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
if (support_avx512())
@ -941,7 +984,8 @@ static char *corename[] = {
"Excavator",
"Zen",
"SkylakeX",
"Cooperlake"
"Cooperlake",
"SapphireRapids"
};
char *gotoblas_corename(void) {
@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26];
return corename[0];
}

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
#else
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#endif
/* Use the dedicated ARMV8SVE kernel table when DYN_ARMV8SVE is defined;
 * otherwise alias it to the generic ARMV8 table. */
#ifdef DYN_ARMV8SVE
extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55
extern gotoblas_t gotoblas_CORTEXA55;
#else
@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
#ifndef NO_SVE
extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
extern gotoblas_t gotoblas_THUNDERX3T110;
extern gotoblas_t gotoblas_CORTEXA55;
#endif
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
#define NUM_CORETYPES 13
#define NUM_CORETYPES 16
/*
* In case asm/hwcap.h is outdated on the build system, make sure
@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg);
#ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11)
#endif
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
#define get_cpu_ftr(id, var) ({ \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
@ -168,6 +181,7 @@ static char *corename[] = {
"neoversen2",
"thunderx3t110",
"cortexa55",
"armv8sve",
"unknown"
};
@ -187,6 +201,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
return corename[NUM_CORETYPES];
}
@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 12: return (&gotoblas_NEOVERSEN2);
case 13: return (&gotoblas_THUNDERX3T110);
case 14: return (&gotoblas_CORTEXA55);
case 15: return (&gotoblas_ARMV8SVE);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_NEOVERSEN1;
#ifndef NO_SVE
case 0xd49:
return &gotoblas_NEOVERSEN2;
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
} else
return &gotoblas_NEOVERSEN2;
case 0xd40:
return &gotoblas_NEOVERSEV1;
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
}else
return &gotoblas_NEOVERSEV1;
#endif
case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55;
@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
openblas_warning(1, coremsg);
}
#ifndef NO_SVE
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
return &gotoblas_ARMV8SVE;
}
#endif
return NULL;
#endif
}

View File

@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;
}
@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;
}
@ -3015,6 +3011,8 @@ void *blas_memory_alloc(int procpos){
#endif
if (memory_overflowed) goto terminate;
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", NUM_BUFFERS);
memory_overflowed=1;
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));

View File

@ -283,7 +283,6 @@ The numbers of threads in the thread pool.
This value is equal or large than blas_cpu_number. This means some threads are sleep.
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;

View File

@ -844,6 +844,23 @@ lapackobjs2z="$lapackobjs2z
zungtsqr_row
"
#functions added for lapack-3.11
lapackobjs2c="$lapackobjs2c
cgedmd
cgedmdq
"
lapackobjs2d="$lapackobjs2d
dgedmd
dgedmdq
"
lapackobjs2s="$lapackobjs2s
sgedmd
sgedmdq
"
lapackobjs2z="$lapackobjs2z
zgedmd
zgedmdq
"
lapack_extendedprecision_objs="
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@ -1013,6 +1030,10 @@ lapackeobjsc="
LAPACKE_cgebrd_work
LAPACKE_cgecon
LAPACKE_cgecon_work
LAPACKE_cgedmd
LAPACKE_cgedmd_work
LAPACKE_cgedmdq
LAPACKE_cgedmdq_work
LAPACKE_cgeequ
LAPACKE_cgeequ_work
LAPACKE_cgeequb
@ -1672,6 +1693,10 @@ lapackeobjsd="
LAPACKE_dgebrd_work
LAPACKE_dgecon
LAPACKE_dgecon_work
LAPACKE_dgedmd
LAPACKE_dgedmd_work
LAPACKE_dgedmdq
LAPACKE_dgedmdq_work
LAPACKE_dgeequ
LAPACKE_dgeequ_work
LAPACKE_dgeequb
@ -2285,6 +2310,10 @@ lapackeobjss="
LAPACKE_sgebrd_work
LAPACKE_sgecon
LAPACKE_sgecon_work
LAPACKE_sgedmd
LAPACKE_sgedmd_work
LAPACKE_sgedmdq
LAPACKE_sgedmdq_work
LAPACKE_sgeequ
LAPACKE_sgeequ_work
LAPACKE_sgeequb
@ -2894,6 +2923,10 @@ lapackeobjsz="
LAPACKE_zgebrd_work
LAPACKE_zgecon
LAPACKE_zgecon_work
LAPACKE_zgedmd
LAPACKE_zgedmd_work
LAPACKE_zgedmdq
LAPACKE_zgedmdq_work
LAPACKE_zgeequ
LAPACKE_zgeequ_work
LAPACKE_zgeequb

View File

@ -101,7 +101,14 @@ else
*flang*)
vendor=FLANG
openmp='-fopenmp'
;;
data=`$compiler -v 2>&1 > /dev/null `
v="${data#*version *}"
v="${v%%*.}"
major="${v%%.*}"
if [ "$major" -ge 17 ]; then
vendor=FLANGNEW
fi
;;
*ifort*|*ifx*)
vendor=INTEL
openmp='-fopenmp'

View File

@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = {
#endif
#endif
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
#define XFEATURE_XTILEDATA 18
#define ARCH_REQ_XCOMP_PERM 0x1023
static int openblas_amxtile_permission = 0;
/*
 * Issue arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA) to request
 * use of the XTILEDATA state for this process.
 *
 * Returns 0 and sets openblas_amxtile_permission to 1 when the syscall
 * succeeds; otherwise prints a diagnostic to stderr and returns -1,
 * leaving openblas_amxtile_permission unchanged.
 */
static int init_amxtile_permission() {
  long status =
      syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
  if (status != 0) {
    fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, "
                    "Intel Sapphire Rapids), skip sbgemm calculation\n");
    return -1;
  }
  openblas_amxtile_permission = 1;
  return 0;
}
#endif
#ifndef CBLAS
void NAME(char *TRANSA, char *TRANSB,
@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#endif
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
#if defined(DYNAMIC_ARCH)
if (gotoblas->need_amxtile_permission &&
openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
return;
}
#endif
#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS)
if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
return;
}
#endif
#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
if ((args.m == 0) || (args.n == 0)) return;
#if 0

View File

@ -77,6 +77,7 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
blasint info;
char transA, transB, Uplo;
blasint nrowa, nrowb;
IFLOAT *buffer;
IFLOAT *aa, *bb;
FLOAT *cc;
@ -155,29 +156,38 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
if (Uplo == 'L')
uplo = 1;
nrowa = m;
if (transa) nrowa = k;
nrowb = k;
if (transb) nrowb = m;
info = 0;
if (uplo < 0)
info = 14;
if (ldc < m)
if (ldc < MAX(1, m))
info = 13;
/* nrowa is the row count of A as stored and nrowb that of B, so lda is
 * bounded by nrowa (argument 8) and ldb by nrowb (argument 10). */
if (ldb < MAX(1, nrowb))
  info = 10;
if (lda < MAX(1, nrowa))
  info = 8;
if (k < 0)
info = 5;
if (m < 0)
info = 3;
info = 4;
if (transb < 0)
info = 2;
info = 3;
if (transa < 0)
info = 2;
if (uplo < 0)
info = 1;
if (info) {
if (info != 0) {
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
blasint k,
#ifndef COMPLEX
FLOAT alpha,
@ -199,17 +209,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
int transa, transb, uplo;
blasint info;
blasint m, lda, ldb;
blasint lda, ldb;
FLOAT *a, *b;
XFLOAT *buffer;
PRINT_DEBUG_CNAME;
uplo = -1;
transa = -1;
transb = -1;
info = 0;
if (order == CblasColMajor) {
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
if (TransA == CblasNoTrans)
transa = 0;
@ -249,15 +262,27 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1;
if (ldc < m)
blasint nrowa, nrowb;
nrowa = m;
if (transa) nrowa = k;
nrowb = k;
if (transb) nrowb = m;
if (ldc < MAX(1, m))
info = 13;
if (ldb < MAX(1, nrowb))
info = 10;
if (lda < MAX(1, nrowa))
info = 8;
if (k < 0)
info = 5;
if (m < 0)
info = 3;
info = 4;
if (transb < 0)
info = 2;
info = 3;
if (transa < 0)
info = 2;
if (uplo < 0)
info = 1;
}
@ -269,6 +294,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
lda = LDB;
ldb = LDA;
if (Uplo == CblasUpper) uplo = 0;
if (Uplo == CblasLower) uplo = 1;
if (TransB == CblasNoTrans)
transa = 0;
if (TransB == CblasTrans)
@ -302,27 +330,30 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1;
if (ldc < m)
blasint ncola, ncolb;
ncola = k;
if (transa) ncola = m;
ncolb = m;
if (transb) ncolb = k;
if (ldc < MAX(1,m))
info = 13;
if (ldb < MAX(1, ncolb))
info = 10;
if (lda < MAX(1, ncola))
info = 8;
if (k < 0)
info = 5;
if (m < 0)
info = 3;
info = 4;
if (transb < 0)
info = 2;
info = 3;
if (transa < 0)
info = 2;
if (uplo < 0)
info = 1;
}
uplo = -1;
if (Uplo == CblasUpper)
uplo = 0;
if (Uplo == CblasLower)
uplo = 1;
if (uplo < 0)
info = 14;
if (info >= 0) {
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
@ -392,7 +423,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
if ((m == 0) )
if (m == 0)
return;
IDEBUG_START;

View File

@ -100,13 +100,13 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( order == BlasColMajor)
{
if ( trans == BlasNoTrans && *ldb < *rows ) info = 9;
if ( trans == BlasTrans && *ldb < *cols ) info = 9;
if ( trans == BlasNoTrans && *ldb < *rows ) info = 8;
if ( trans == BlasTrans && *ldb < *cols ) info = 8;
}
if ( order == BlasRowMajor)
{
if ( trans == BlasNoTrans && *ldb < *cols ) info = 9;
if ( trans == BlasTrans && *ldb < *rows ) info = 9;
if ( trans == BlasNoTrans && *ldb < *cols ) info = 8;
if ( trans == BlasTrans && *ldb < *rows ) info = 8;
}
if ( order == BlasColMajor && *lda < *rows ) info = 7;
@ -120,17 +120,20 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#ifdef NEW_IMATCOPY
if ( *lda == *ldb && *rows == *cols) {
if ( *lda == *ldb ) {
if ( order == BlasColMajor )
{
if ( trans == BlasNoTrans )
{
IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda );
return;
}
else
else if ( *rows == *cols )
{
IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda );
return;
}
}
else
@ -138,26 +141,23 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasNoTrans )
{
IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda );
return;
}
else
else if ( *rows == *cols )
{
IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
return;
}
}
return;
}
#endif
if ( *lda > *ldb )
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT);
else
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT);
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
b = malloc(msize);
if ( b == NULL )
{
printf("Memory alloc failed\n");
printf("Memory alloc failed in imatcopy\n");
exit(1);
}
@ -165,26 +165,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
{
if ( trans == BlasNoTrans )
{
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb );
OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb );
}
else
{
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb );
}
}
else
{
if ( trans == BlasNoTrans )
{
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb );
}
else
{
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb );
}
}

View File

@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
if (n <= 0) return 0.;
#ifndef COMPLEX
if (n == 1)
#ifdef DOUBLE
return fabs(x[0]);
#else
return fabsf(x[0]);
#endif
#endif
if (incx < 0)
#ifdef COMPLEX
x -= (n - 1) * incx * 2;
#else
x -= (n - 1) * incx;
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();
@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
if (n <= 0) return 0.;
#ifndef COMPLEX
if (n == 1)
#ifdef DOUBLE
return fabs(x[0]);
#else
return fabsf(x[0]);
#endif
#endif
if (incx < 0)
#ifdef COMPLEX
x -= (n - 1) * incx * 2;
#else
x -= (n - 1) * incx;
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO,
int nodes;
#endif
# if defined(SMP)
int MN;
double MN;
#endif
blasint info;
int side;
@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
int nodes;
#endif
#if defined(SMP)
int MN;
double MN;
#endif
PRINT_DEBUG_CNAME;

View File

@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS,
FLOAT *sa, *sb;
#ifdef SMP
int NNK;
double NNK;
#ifdef USE_SIMPLE_THREADED_LEVEL3
#ifndef COMPLEX
#ifdef XDOUBLE
@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
FLOAT *sa, *sb;
#ifdef SMP
int NNK;
double NNK;
#ifdef USE_SIMPLE_THREADED_LEVEL3
#ifndef COMPLEX

View File

@ -125,27 +125,33 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#ifdef NEW_IMATCOPY
if (*lda == *ldb && *cols == *rows) {
if (*lda == *ldb ) {
if ( order == BlasColMajor )
{
if ( trans == BlasNoTrans )
{
IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasConj )
{
IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTrans )
if ( trans == BlasTrans && *rows == *cols )
{
IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTransConj )
if ( trans == BlasTransConj && *rows == *cols )
{
IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
}
else
{
@ -153,67 +159,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasNoTrans )
{
IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasConj )
{
IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTrans )
if ( trans == BlasTrans && *rows == *cols )
{
IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
if ( trans == BlasTransConj )
if ( trans == BlasTransConj && *rows == *cols )
{
IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
return;
}
}
return;
}
#endif
if ( *lda > *ldb )
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2;
else
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2;
b = malloc(msize);
if ( b == NULL )
{
printf("Memory alloc failed in zimatcopy\n");
exit(1);
}
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
b = malloc(msize);
if ( b == NULL )
{
printf("Memory alloc failed in zimatcopy\n");
exit(1);
}
if ( order == BlasColMajor )
{
if ( trans == BlasNoTrans )
{
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
if ( trans == BlasConj )
else if ( trans == BlasConj )
{
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
if ( trans == BlasTrans )
else if ( trans == BlasTrans )
{
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
if ( trans == BlasTransConj )
else if ( trans == BlasTransConj )
{
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
}
@ -222,34 +220,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
if ( trans == BlasNoTrans )
{
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
if ( trans == BlasConj )
else if ( trans == BlasConj )
{
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
}
if ( trans == BlasTrans )
else if ( trans == BlasTrans )
{
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
if ( trans == BlasTransConj )
else if ( trans == BlasTransConj )
{
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
free(b);
return;
OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
}
}
free(b);
return;

View File

@ -33,7 +33,7 @@ endif
ifdef TARGET_CORE
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq ($(GCCVERSIONGTEQ11), 1)
override CFLAGS += -march=sapphirerapids
else
override CFLAGS += -march=skylake-avx512 -mavx512f

View File

@ -35,6 +35,12 @@ USE_TRMM = 1
endif
endif
ifneq ($(DYNAMIC_ARCH), 1)
ifeq ($(TARGET), MIPS64_GENERIC)
USE_TRMM = 1
endif
endif
ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif

View File

@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT absxi = 0.0;
if (n <= 0 || inc_x <= 0) return(0.0);
if (n <= 0 || inc_x == 0) return(0.0);
if ( n == 1 ) return( ABS(x[0]) );
n *= inc_x;

View File

@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG inc_x2;
FLOAT temp;
if (n <= 0 || inc_x <= 0) return(0.0);
if (n <= 0 || inc_x == 0) return(0.0);
inc_x2 = 2 * inc_x;

View File

@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DDOTKERNEL = dot.c
SDOTKERNEL = dot.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
@ -128,10 +118,10 @@ SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
@ -149,8 +139,8 @@ SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c

View File

@ -1,189 +1 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
DDOTKERNEL = dot.c
SDOTKERNEL = dot.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(SGEMM_UNROLL_N), 16)
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
else
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
endif
ifeq ($(SGEMM_UNROLL_N), 4)
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
else
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
include $(KERNELDIR)/KERNEL.ARMV8SVE

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define alphaI w19
#define alpha0_R s10
#define alphaV0_R v10.s[0]

View File

@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define alphaI w19
#define alpha0_R s10
#define alphaV0_R v10.s[0]

View File

@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pB, pB, 32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNELv1x4_M1
@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNELv1x4_M2
@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri z23.s, p1/m, z2.s, z15.s
ld1rw z15.s, p0/z, [pB, 28]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
add pB, pB, 32
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm
.macro KERNELv1x4_E
@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ii z22.s, p1/m, z3.s, z15.s
OP_ri z23.s, p1/m, z2.s, z15.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm
.macro KERNELv1x4_SUB
@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ii z22.s, p1/m, z1.s, z15.s
OP_ri z23.s, p1/m, z0.s, z15.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I
@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2w {z26.s, z27.s}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #3
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld2w {z28.s, z29.s}, p1/z, [pCRow2]
fmla z28.s, p1/m, z20.s, alphaz_R
fmls z28.s, p1/m, z21.s, alphaz_I
@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z31.s, p1/m, z23.s, alphaz_R
st2w {z30.s, z31.s}, p1, [pCRow3]
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I
@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2w {z26.s, z27.s}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #3
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I
@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, s0
dup alphaz_R, alphaR
fmov alphaI, s1
@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bne .Lcgemm_kernel_L4_Mv1_46
.Lcgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Lcgemm_kernel_L4_Mv1_END:

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
do {
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * lda * 2;
j += svcntw();
pg = svwhilelt_b32(j, n);
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg);

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b32(j, n);
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg);
do {
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * 2;
j += svcntw();
pg = svwhilelt_b32(j, n);
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow3 x15
#define pA x16
#define alphaR w17
#define alphaI w18
#define temp x19
#define tempOffset x20
#define tempK x21
#define alphaI w19
#define temp x20
#define tempOffset x21
#define tempK x22
#define alpha0_R s10
#define alphaV0_R v10.s[0]

View File

@ -1,79 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Pack a double-precision panel using SVE gather loads.
 *
 * For each group of up to svcntd() indices j in [0, n), and for each of the
 * m steps i, one vector is gathered from a[i + j * lda] (lane k reads the
 * element k * lda further on) and stored contiguously to b.  The destination
 * advances by the number of active lanes after every store, so the final,
 * partially filled group is packed without padding.
 *
 * m, n : loop extents (inner, outer); lda : element stride between lanes.
 * a    : source buffer; b : destination buffer.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG j;
    IFLOAT *aoffset, *aoffset1, *boffset;

    /* Nothing to pack.  Returning early also keeps the unsigned predicate
       comparisons below from misinterpreting a negative n. */
    if (n <= 0) return 0;

    svint64_t lda_vec = svindex_s64(0LL, lda);
    uint64_t sve_size = svcntd();

    aoffset = a;
    boffset = b;

    j = 0;
    /* Cast explicitly: BLASLONG is not guaranteed to be the exact int64_t
       type, which makes the overloaded svwhilelt_b64 call ambiguous on some
       toolchains. */
    svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
    uint64_t active = svcntp_b64(svptrue_b64(), pg);
    do {
        aoffset1 = aoffset;

        uint64_t i_cnt = m;
        while (i_cnt--) {
            svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
            svst1_f64(pg, (double *) boffset, a_vec);
            aoffset1++;
            boffset += active;
        }
        aoffset += sve_size * lda;

        j += sve_size;
        pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
        active = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}

View File

@ -1,77 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Pack a double-precision panel using contiguous SVE loads.
 *
 * For each group of up to svcntd() consecutive indices j in [0, n), and for
 * each of the m steps i, one vector is loaded from a[j + i * lda] and stored
 * contiguously to b.  The destination advances by the number of active lanes
 * after every store, so the final, partially filled group is packed without
 * padding.
 *
 * m, n : loop extents (inner, outer); lda : element stride between steps.
 * a    : source buffer; b : destination buffer.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG j;
    IFLOAT *aoffset, *aoffset1, *boffset;

    /* Nothing to pack.  Returning early also keeps the unsigned predicate
       comparisons below from misinterpreting a negative n. */
    if (n <= 0) return 0;

    uint64_t sve_size = svcntd();

    aoffset = a;
    boffset = b;

    j = 0;
    /* Cast explicitly: BLASLONG is not guaranteed to be the exact int64_t
       type, which makes the overloaded svwhilelt_b64 call ambiguous on some
       toolchains. */
    svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
    uint64_t active = svcntp_b64(svptrue_b64(), pg);
    do {
        aoffset1 = aoffset;

        uint64_t i_cnt = m;
        while (i_cnt--) {
            svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
            svst1_f64(pg, (double *) boffset, a_vec);
            aoffset1 += lda;
            boffset += active;
        }
        aoffset += sve_size;

        j += sve_size;
        pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
        active = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}

View File

@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
BLASLONG sve_width = SVE_WIDTH;
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
svbool_t pg_a = SVE_WHILELT(i, n);
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);

View File

@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#include <float.h>
#include <arm_neon.h>
#if defined(SMP)
@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#else
nrm2_compute(n, x, inc_x, &ssq, &scale);
#endif
if (fabs(scale) <1.e-300) return 0.;
volatile FLOAT sca = fabs(scale);
if (sca < DBL_MIN) return 0.;
ssq = sqrt(ssq) * scale;
return ssq;

View File

@ -0,0 +1,121 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/*
 * Precision selection.  DOUBLE picks the 64-bit SVE element types and
 * helpers; otherwise the 32-bit ones are used.
 *   COUNT      - mnemonic string pasted into the inline asm that reads the
 *                number of elements per vector (cntd / cntw)
 *   SV_TYPE    - floating-point vector type
 *   SV_INDEX   - unsigned index vector type used for gather loads
 *   SV_INDEXER - intrinsic that builds a (base, step) index vector
 *   SV_TRUE    - all-true predicate for the element width
 *   SV_WHILE   - "while less than" predicate constructor for the tail
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/*
 * INNER_COPY: copy one vector of complex values.
 * Gathers the real parts from a_offset_inner and the imaginary parts from
 * a_offset_inner + 1 (both indexed by lda_vec), stores them interleaved to
 * b_offset with svst2, then advances a_offset_inner by 2 and b_offset by
 * 2 * active.
 * NOTE: the macro is not hygienic - it reads lda_vec and writes a_vec_real /
 * a_vec_imag, which must exist in the scope where it is expanded.  The lda
 * parameter is accepted but not used in the expansion.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \
a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \
svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \
a_offset_inner += 2; \
b_offset += active * 2;
/*
 * Pack a complex panel with SVE gather loads.
 *
 * The n extent is processed in groups of sve_size lanes using an all-true
 * predicate, followed by one partially predicated group for whatever is left
 * over.  Within a group the m extent is walked with INNER_COPY, manually
 * unrolled by four with 2- and 1-step tails.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Elements per SVE vector, read by the instruction named in COUNT. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *group_src = a;   /* start of the current lane group in a */
    IFLOAT *src;             /* cursor inside the current group      */
    IFLOAT *dst = b;         /* running write position in b          */

    /* lda_vec, a_vec_real and a_vec_imag are referenced by name inside
       INNER_COPY, so their identifiers must stay as they are. */
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
    SV_TYPE a_vec_real;
    SV_TYPE a_vec_imag;

    svbool_t all_lanes = SV_TRUE();

    /* Largest multiple of sve_size not exceeding n, and the leftover. */
    BLASLONG full_n = n & -sve_size;
    BLASLONG tail_n = n - full_n;

    /* Groups in which every lane is active. */
    BLASLONG j = 0;
    while (j < full_n) {
        src = group_src;

        for (uint64_t quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }

        if (m & 2) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }

        if (m & 1) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }

        group_src += sve_size * lda * 2;
        j += sve_size;
    }

    /* Final group with only tail_n active lanes. */
    if (tail_n != 0) {
        src = group_src;
        svbool_t tail_lanes = SV_WHILE((uint64_t)0L, (uint64_t)tail_n);
        uint64_t tail_count = tail_n;

        for (uint64_t quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(tail_lanes, src, dst, lda, tail_count);
            INNER_COPY(tail_lanes, src, dst, lda, tail_count);
            INNER_COPY(tail_lanes, src, dst, lda, tail_count);
            INNER_COPY(tail_lanes, src, dst, lda, tail_count);
        }

        if (m & 2) {
            INNER_COPY(tail_lanes, src, dst, lda, tail_count);
            INNER_COPY(tail_lanes, src, dst, lda, tail_count);
        }

        if (m & 1) {
            INNER_COPY(tail_lanes, src, dst, lda, tail_count);
        }
    }

    return 0;
}

View File

@ -0,0 +1,131 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/*
 * Precision selection.  DOUBLE picks the 64-bit SVE element types and
 * helpers; otherwise the 32-bit ones are used.
 *   COUNT       - mnemonic string pasted into the inline asm that reads the
 *                 number of elements per vector (cntd / cntw)
 *   SV_TYPE     - floating-point vector type
 *   SV_INDEX    - unsigned index vector type used for gather loads
 *   SV_INDEXER  - intrinsic that builds a (base, step) index vector
 *   SV_TRUE     - all-true predicate for the element width
 *   SV_WHILE    - "while less than" predicate constructor for the tail
 *   SV_PREFETCH - gather-prefetch intrinsic for the element width
 *                 (NOTE(review): no use of it is visible in this part of
 *                 the file - confirm whether it is still needed)
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#define SV_PREFETCH svprfd_gather_index
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#define SV_PREFETCH svprfw_gather_index
#endif
/*
 * INNER_COPY: copy one vector of real values.
 * Gathers a vector from a_offset_inner indexed by lda_vec, stores it
 * contiguously to b_offset, then advances a_offset_inner by 1 and b_offset
 * by active.
 * NOTE: the macro is not hygienic - it reads lda_vec and writes a_vec, which
 * must exist in the scope where it is expanded.  The lda parameter is
 * accepted but not used in the expansion.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \
svst1(pg, b_offset, a_vec); \
a_offset_inner++; \
b_offset += active;
/*
 * Packs an m x n block of `a` into the contiguous buffer `b`.
 *
 * The n dimension is walked in groups of sve_size elements (one full SVE
 * vector); a final partial group is handled with a narrower predicate.
 * For every group, m INNER_COPY steps are issued: each gathers one element
 * from each of the group's lanes (lanes are `lda` elements apart in `a`)
 * and appends them to `b`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Elements per vector, read with the instruction named by COUNT. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    /* INNER_COPY picks these two up by name from this scope. */
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda);
    SV_TYPE a_vec;

    svbool_t all_lanes = SV_TRUE();
    IFLOAT *src_base = a;
    IFLOAT *dst = b;
    IFLOAT *src;

    /* n with the low bits masked off; a multiple of sve_size when sve_size
     * is a power of two. */
    BLASLONG full_n = n & -sve_size;

    /* Full-width groups: every lane is active. */
    for (BLASLONG col = 0; col < full_n; col += sve_size) {
        src = src_base;

        /* m steps, unrolled 8 / 4 / 2 / 1. */
        for (uint64_t blocks = m >> 3; blocks != 0; blocks--) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }
        if ((m & 4) != 0) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }
        if ((m & 2) != 0) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }
        if ((m & 1) != 0) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }

        /* Next group starts sve_size lanes further on, i.e. sve_size * lda
         * elements into `a`. */
        src_base += sve_size * lda;
    }

    /* Leftover lanes that do not fill a whole vector. */
    BLASLONG tail_n = n - full_n;
    if (tail_n == 0) {
        return 0;
    }

    src = src_base;
    svbool_t tail_lanes = SV_WHILE((uint64_t)0L, (uint64_t)tail_n);
    uint64_t tail_width = tail_n;

    /* m steps, unrolled 4 / 2 / 1. */
    for (uint64_t blocks = m >> 2; blocks != 0; blocks--) {
        INNER_COPY(tail_lanes, src, dst, lda, tail_width);
        INNER_COPY(tail_lanes, src, dst, lda, tail_width);
        INNER_COPY(tail_lanes, src, dst, lda, tail_width);
        INNER_COPY(tail_lanes, src, dst, lda, tail_width);
    }
    if ((m & 2) != 0) {
        INNER_COPY(tail_lanes, src, dst, lda, tail_width);
        INNER_COPY(tail_lanes, src, dst, lda, tail_width);
    }
    if ((m & 1) != 0) {
        INNER_COPY(tail_lanes, src, dst, lda, tail_width);
    }

    return 0;
}

View File

@ -0,0 +1,115 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/*
 * Element-width selection.  DOUBLE picks the 64-bit variants of every SVE
 * type/intrinsic used below; otherwise the 32-bit variants are used.
 *
 *   COUNT    - instruction mnemonic pasted into the inline asm in CNAME to
 *              read the number of elements per vector
 *   SV_TYPE  - two-vector tuple type produced by svld2 / consumed by svst2
 *   SV_TRUE  - all-lanes-active predicate
 *   SV_WHILE - predicate covering a partial vector
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64x2_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32x2_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/*
 * One copy step: load two-element structures for the lanes enabled by `pg`
 * from `a_offset_inner` (svld2), store them at `b_offset` (svst2), then
 * advance the source by `lda * 2` scalars and the destination by
 * `active * 2` scalars.
 *
 * Notes for callers:
 *  - `a_vec` is NOT a parameter; it must exist in the scope where the macro
 *    is expanded.
 *  - the expansion is several statements, so use it inside a braced block.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec = svld2(pg, a_offset_inner); \
svst2(pg, b_offset, a_vec); \
a_offset_inner += lda * 2; \
b_offset += active * 2;
/*
 * Packs an m x n block of two-scalar elements from `a` into the contiguous
 * buffer `b`.  (The factor of 2 on every stride suggests interleaved
 * real/imaginary pairs - NOTE(review): confirm against the build rules that
 * select this kernel.)
 *
 * The n dimension is walked in groups of sve_size elements (one full SVE
 * vector); a final partial group is handled with a narrower predicate.
 * For every group, m INNER_COPY steps are issued; consecutive steps read
 * from addresses `lda * 2` scalars apart and append to `b`.
 *
 * Parameters:
 *   m, n - extent of the block being packed
 *   a    - source; elements of one step are contiguous
 *   lda  - distance between consecutive steps in `a`, in two-scalar elements
 *   b    - destination, written sequentially
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /*
     * Elements per vector, read with the instruction named by COUNT
     * (cntd for DOUBLE, cntw otherwise).  No initialiser: the asm output
     * operand is the only definition of this variable.  A svcntw()
     * initialiser would be overwritten at once and would name the wrong
     * granule in DOUBLE builds.
     */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* INNER_COPY picks a_vec up by name from this scope. */
    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /* n with the low bits masked off; a multiple of sve_size when sve_size
     * is a power of two. */
    BLASLONG single_vectors_n = n & -sve_size;

    /* Full-width groups: every lane is active. */
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;

        /* m steps, unrolled 4 / 2 / 1. */
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Next group: sve_size elements (two scalars each) further on. */
        a_offset += sve_size * 2;
    }

    /* Leftover lanes that do not fill a whole vector. */
    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;

        /* m steps, unrolled 4 / 2 / 1. */
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}

View File

@ -0,0 +1,125 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
/*
 * Element-width selection.  DOUBLE picks the 64-bit variants of every SVE
 * type/intrinsic used below; otherwise the 32-bit variants are used.
 *
 *   COUNT    - instruction mnemonic pasted into the inline asm in CNAME to
 *              read the number of elements per vector
 *   SV_TYPE  - vector type holding the copied data
 *   SV_TRUE  - all-lanes-active predicate
 *   SV_WHILE - predicate covering a partial vector
 */
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/*
 * One copy step: contiguous load of the lanes enabled by `pg` from
 * `a_offset_inner`, contiguous store to `b_offset`, then advance the source
 * by `lda` elements and the destination by `active` elements.
 *
 * Notes for callers:
 *  - `a_vec` is NOT a parameter; it must exist in the scope where the macro
 *    is expanded.
 *  - the expansion is several statements, so use it inside a braced block.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
a_vec = svld1(pg, a_offset_inner); \
svst1(pg, b_offset, a_vec); \
a_offset_inner += lda; \
b_offset += active;
/*
 * Packs an m x n block of `a` into the contiguous buffer `b` using
 * contiguous vector loads.
 *
 * The n dimension is walked in groups of sve_size elements (one full SVE
 * vector); a final partial group is handled with a narrower predicate.
 * For every group, m INNER_COPY steps are issued; consecutive steps read
 * from addresses `lda` elements apart and append to `b`.
 *
 * Parameters:
 *   m, n - extent of the block being packed
 *   a    - source; elements of one step are contiguous
 *   lda  - distance between consecutive steps in `a`, in elements
 *   b    - destination, written sequentially
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /*
     * Elements per vector, read with the instruction named by COUNT
     * (cntd for DOUBLE, cntw otherwise).  No initialiser: the asm output
     * operand is the only definition of this variable.  A svcntw()
     * initialiser would be overwritten at once and would name the wrong
     * granule in DOUBLE builds.
     */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* INNER_COPY picks a_vec up by name from this scope. */
    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /* n with the low bits masked off; a multiple of sve_size when sve_size
     * is a power of two. */
    BLASLONG single_vectors_n = n & -sve_size;

    /* Full-width groups: every lane is active. */
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;

        /* m steps, unrolled 8 / 4 / 2 / 1. */
        uint64_t i_cnt = m >> 3;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 4) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Next group: sve_size elements further on. */
        a_offset += sve_size;
    }

    /* Leftover lanes that do not fill a whole vector. */
    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;

        /* m steps, unrolled 4 / 2 / 1. */
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}

0
kernel/arm64/sgemm_beta.S Executable file → Normal file
View File

View File

@ -1,78 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Single-precision gather-based pack of an m x n block of `a` into `b`.
 *
 * n is consumed one SVE vector of 32-bit lanes at a time; the predicate
 * built by svwhilelt_b32 masks off lanes at or beyond n.  For each group,
 * m steps gather one element per active lane (lanes are `lda` elements
 * apart) and append the active lanes to `b`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    svint32_t lane_index = svindex_s32(0LL, lda);
    uint32_t lanes = svcntw();

    IFLOAT *src_group = a;
    IFLOAT *dst = b;
    IFLOAT *src;

    BLASLONG col = 0;
    svbool_t live = svwhilelt_b32(col, n);
    uint32_t live_count = svcntp_b32(svptrue_b32(), live);

    do {
        src = src_group;

        /* One gather + store per step; m is narrowed to 32 bits here. */
        for (uint32_t steps = m; steps != 0; steps--) {
            svfloat32_t data = svld1_gather_index(live, (float *) src, lane_index);
            svst1_f32(live, (float *) dst, data);
            src += 1;
            dst += live_count;
        }

        /* Move on to the next group of lanes and rebuild the predicate. */
        src_group += lanes * lda;
        col += svcntw();
        live = svwhilelt_b32(col, n);
        live_count = svcntp_b32(svptrue_b32(), live);
    } while (svptest_any(svptrue_b32(), live));

    return 0;
}

View File

@ -1,77 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include <arm_sve.h>
// TODO: write in assembly with proper unrolling of inner loop
/*
 * Single-precision contiguous pack of an m x n block of `a` into `b`.
 *
 * n is consumed one SVE vector of 32-bit lanes at a time; the predicate
 * built by svwhilelt_b32 masks off lanes at or beyond n.  For each group,
 * m steps load the active lanes contiguously (steps are `lda` elements
 * apart) and append them to `b`.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    uint32_t lanes = svcntw();

    IFLOAT *src_group = a;
    IFLOAT *dst = b;
    IFLOAT *src;

    BLASLONG col = 0;
    svbool_t live = svwhilelt_b32(col, n);
    uint32_t live_count = svcntp_b32(svptrue_b32(), live);

    do {
        src = src_group;

        /* One load + store per step; m is narrowed to 32 bits here. */
        for (uint32_t steps = m; steps != 0; steps--) {
            svfloat32_t data = svld1(live, (float *) src);
            svst1_f32(live, (float *) dst, data);
            src += lda;
            dst += live_count;
        }

        /* Move on to the next group of lanes and rebuild the predicate. */
        src_group += lanes;
        col += svcntw();
        live = svwhilelt_b32(col, n);
        live_count = svcntp_b32(svptrue_b32(), live);
    } while (svptest_any(svptrue_b32(), live));

    return 0;
}

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE
int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
int32_t N = n;
int32_t js = 0;
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N);
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do {
@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, N);
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset;
#ifdef DOUBLE
int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
int32_t N = n;
int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N);
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do {
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, N);
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE
int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
int32_t N = n;
int32_t js = 0;
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N);
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, N);
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset;
#ifdef DOUBLE
int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
int32_t N = n;
int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N);
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do {
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, N);
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNELv1x4_M1
@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNELv1x4_M2
@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri z23.d, p1/m, z2.d, z15.d
ld1rd z15.d, p0/z, [pB, 56]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
add pB, pB, 64
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm
.macro KERNELv1x4_E
@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ir z23.d, p1/m, z3.d, z14.d
OP_ii z22.d, p1/m, z3.d, z15.d
OP_ri z23.d, p1/m, z2.d, z15.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm
.macro KERNELv1x4_SUB
@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ir z23.d, p1/m, z1.d, z14.d
OP_ii z22.d, p1/m, z1.d, z15.d
OP_ri z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm
.macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaz_R
fmls z24.d, p1/m, z17.d, alphaz_I
@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2d {z26.d, z27.d}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #4
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld2d {z28.d, z29.d}, p1/z, [pCRow2]
fmla z28.d, p1/m, z20.d, alphaz_R
@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z31.d, p1/m, z23.d, alphaz_R
st2d {z30.d, z31.d}, p1, [pCRow3]
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaz_R
fmls z24.d, p1/m, z17.d, alphaz_I
@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2d {z26.d, z27.d}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #4
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaz_R
fmls z24.d, p1/m, z17.d, alphaz_I
@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm
/******************************************************************************/
@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
dup alphaz_R, alphaR
fmov alphaI, d1
@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bne .Lzgemm_kernel_L4_Mv1_46
.Lzgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4
.Lzgemm_kernel_L4_Mv1_END:

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
uint64_t active = svcntp_b64(svptrue_b64(), pg);
do {
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * lda * 2;
j += svcntd();
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b;
j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
uint64_t active = svcntp_b64(svptrue_b64(), pg);
do {
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * 2;
j += svcntd();
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
if (offset <= 0) {
svbool_t off_g = svwhilelt_b64(offset, 0LL);
svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
}
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t j = 0;
int32_t N = n;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
if (offset <= 0) {
svbool_t off_g = svwhilelt_b32(offset, 0);
svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
}
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
data_vec_imag = svneg_z(pg, data_vec_imag);
if (offset <= 0) {
svbool_t off_g = svwhilelt_b64(offset, 0LL);
svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
}
@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
#else
@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t j = 0;
int32_t N = n;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
data_vec_imag = svneg_z(pg, data_vec_imag);
if (offset <= 0) {
svbool_t off_g = svwhilelt_b32(offset, 0);
svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
}
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n);
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL);
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s64(posX);
j += sve_size;
pg = svwhilelt_b64(j, n);
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg));
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n;
int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N);
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1);
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size;
posX_vec = svdup_s32(posX);
j += sve_size;
pg = svwhilelt_b32(j, N);
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
#ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao;
js = 0;
#ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
svbool_t pn = svwhilelt_b32(js, n);
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do
@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active;
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, n);
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE
int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
int32_t N = n;
int32_t js = 0;
svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N);
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, N);
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset;
#ifdef DOUBLE
int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n);
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn);
#else
int32_t N = n;
int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N);
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn);
#endif
do {
@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active;
#ifdef DOUBLE
pn = svwhilelt_b64(js, n);
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn));
#else
pn = svwhilelt_b32(js, N);
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn));
#endif

Some files were not shown because too many files have changed in this diff Show More