Merge pull request #4606 from OpenMathLib/develop
Merge develop branch for 0.3.27
This commit is contained in:
commit
8f3bb62254
86
.cirrus.yml
86
.cirrus.yml
|
@ -1,44 +1,44 @@
|
||||||
macos_instance:
|
macos_instance:
|
||||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||||
|
|
||||||
task:
|
#task:
|
||||||
name: AppleM1/LLVM
|
# name: AppleM1/LLVM
|
||||||
compile_script:
|
# compile_script:
|
||||||
- brew install llvm
|
# - brew install llvm
|
||||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||||
|
|
||||||
task:
|
#task:
|
||||||
name: AppleM1/LLVM/ILP64
|
# name: AppleM1/LLVM/ILP64
|
||||||
compile_script:
|
# compile_script:
|
||||||
- brew install llvm
|
# - brew install llvm
|
||||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||||
|
|
||||||
task:
|
#task:
|
||||||
name: AppleM1/LLVM/CMAKE
|
# name: AppleM1/LLVM/CMAKE
|
||||||
compile_script:
|
# compile_script:
|
||||||
- brew install llvm
|
# - brew install llvm
|
||||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
- mkdir build
|
# - mkdir build
|
||||||
- cd build
|
# - cd build
|
||||||
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||||
- make -j 4
|
# - make -j 4
|
||||||
|
|
||||||
task:
|
#task:
|
||||||
name: AppleM1/GCC/MAKE/OPENMP
|
# name: AppleM1/GCC/MAKE/OPENMP
|
||||||
compile_script:
|
# compile_script:
|
||||||
- brew install gcc@11
|
# - brew install gcc@11
|
||||||
- export PATH=/opt/homebrew/bin:$PATH
|
# - export PATH=/opt/homebrew/bin:$PATH
|
||||||
- export LDFLAGS="-L/opt/homebrew/lib"
|
# - export LDFLAGS="-L/opt/homebrew/lib"
|
||||||
- export CPPFLAGS="-I/opt/homebrew/include"
|
# - export CPPFLAGS="-I/opt/homebrew/include"
|
||||||
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||||
|
|
||||||
macos_instance:
|
macos_instance:
|
||||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||||
|
@ -58,8 +58,8 @@ task:
|
||||||
- export VALID_ARCHS="i386 x86_64"
|
- export VALID_ARCHS="i386 x86_64"
|
||||||
- xcrun --sdk macosx --show-sdk-path
|
- xcrun --sdk macosx --show-sdk-path
|
||||||
- xcodebuild -version
|
- xcodebuild -version
|
||||||
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
- export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
|
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64"
|
||||||
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||||
always:
|
always:
|
||||||
config_artifacts:
|
config_artifacts:
|
||||||
|
@ -78,8 +78,8 @@ task:
|
||||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
- export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||||
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
|
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
|
||||||
always:
|
always:
|
||||||
config_artifacts:
|
config_artifacts:
|
||||||
|
@ -91,14 +91,16 @@ macos_instance:
|
||||||
task:
|
task:
|
||||||
name: AppleM1/LLVM armv7-androidndk xbuild
|
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||||
compile_script:
|
compile_script:
|
||||||
- #brew install android-ndk
|
- brew install android-ndk
|
||||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
- find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
|
- ls /System/Volumes/Data/opt/homebrew
|
||||||
|
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/
|
||||||
|
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
|
||||||
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||||
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||||
always:
|
always:
|
||||||
config_artifacts:
|
config_artifacts:
|
||||||
|
|
|
@ -0,0 +1,149 @@
|
||||||
|
name: apple m
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read # to fetch code (actions/checkout)
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||||
|
runs-on: macos-14
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
build: [cmake, make]
|
||||||
|
fortran: [gfortran]
|
||||||
|
openmp: [0, 1]
|
||||||
|
ilp64: [0, 1]
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: Print system information
|
||||||
|
run: |
|
||||||
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
cat /proc/cpuinfo
|
||||||
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
|
sysctl -a | grep machdep.cpu
|
||||||
|
else
|
||||||
|
echo "::error::$RUNNER_OS not supported"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Install Dependencies
|
||||||
|
run: |
|
||||||
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||||
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
|
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||||
|
brew reinstall gcc
|
||||||
|
brew install coreutils cmake ccache
|
||||||
|
brew install llvm
|
||||||
|
else
|
||||||
|
echo "::error::$RUNNER_OS not supported"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Compilation cache
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.ccache
|
||||||
|
# We include the commit sha in the cache key, as new cache entries are
|
||||||
|
# only created if there is no existing entry for the key yet.
|
||||||
|
# GNU make and cmake call the compilers differently. It looks like
|
||||||
|
# that causes the cache to mismatch. Keep the ccache for both build
|
||||||
|
# tools separate to avoid polluting each other.
|
||||||
|
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
|
||||||
|
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
|
||||||
|
restore-keys: |
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.build }}
|
||||||
|
|
||||||
|
- name: Configure ccache
|
||||||
|
run: |
|
||||||
|
if [ "${{ matrix.build }}" = "make" ]; then
|
||||||
|
# Add ccache to path
|
||||||
|
if [ "$RUNNER_OS" = "Linux" ]; then
|
||||||
|
echo "/usr/lib/ccache" >> $GITHUB_PATH
|
||||||
|
elif [ "$RUNNER_OS" = "macOS" ]; then
|
||||||
|
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
|
||||||
|
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
|
||||||
|
echo "" >>$GITHUB_PATH
|
||||||
|
else
|
||||||
|
echo "::error::$RUNNER_OS not supported"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
|
||||||
|
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||||
|
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||||
|
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||||
|
ccache -s
|
||||||
|
|
||||||
|
- name: Build OpenBLAS
|
||||||
|
run: |
|
||||||
|
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||||
|
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||||
|
export CC="/opt/homebrew/opt/llvm/bin/clang"
|
||||||
|
case "${{ matrix.build }}" in
|
||||||
|
"make")
|
||||||
|
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
|
||||||
|
;;
|
||||||
|
"cmake")
|
||||||
|
export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
|
||||||
|
mkdir build && cd build
|
||||||
|
cmake -DDYNAMIC_ARCH=1 \
|
||||||
|
-DUSE_OPENMP=${{matrix.openmp}} \
|
||||||
|
-DINTERFACE64=${{matrix.ilp64}} \
|
||||||
|
-DNOFORTRAN=0 \
|
||||||
|
-DBUILD_WITHOUT_LAPACK=0 \
|
||||||
|
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=Release \
|
||||||
|
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
|
||||||
|
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||||
|
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||||
|
..
|
||||||
|
cmake --build .
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "::error::Configuration not supported"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
- name: Show ccache status
|
||||||
|
continue-on-error: true
|
||||||
|
run: ccache -s
|
||||||
|
|
||||||
|
- name: Run tests
|
||||||
|
timeout-minutes: 60
|
||||||
|
run: |
|
||||||
|
case "${{ matrix.build }}" in
|
||||||
|
"make")
|
||||||
|
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
|
||||||
|
echo "::group::Tests in 'test' directory"
|
||||||
|
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||||
|
echo "::endgroup::"
|
||||||
|
echo "::group::Tests in 'ctest' directory"
|
||||||
|
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||||
|
echo "::endgroup::"
|
||||||
|
echo "::group::Tests in 'utest' directory"
|
||||||
|
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||||
|
echo "::endgroup::"
|
||||||
|
;;
|
||||||
|
"cmake")
|
||||||
|
cd build && ctest
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "::error::Configuration not supported"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
|
@ -14,8 +14,8 @@ jobs:
|
||||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
env:
|
env:
|
||||||
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
|
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
|
||||||
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
|
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
|
@ -76,7 +76,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
wget ${xuetie_toolchain}/${toolchain_file_name}
|
wget ${xuetie_toolchain}/${toolchain_file_name}
|
||||||
tar -xvf ${toolchain_file_name} -C /opt
|
tar -xvf ${toolchain_file_name} -C /opt
|
||||||
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
|
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
|
||||||
|
|
||||||
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,7 @@ jobs:
|
||||||
- name: Install Dependencies
|
- name: Install Dependencies
|
||||||
run: |
|
run: |
|
||||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||||
|
sudo apt-get update
|
||||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||||
|
|
|
@ -0,0 +1,253 @@
|
||||||
|
name: riscv64 zvl256b qemu test
|
||||||
|
|
||||||
|
on: [push, pull_request]
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read # to fetch code (actions/checkout)
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
TEST:
|
||||||
|
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
env:
|
||||||
|
triple: riscv64-unknown-linux-gnu
|
||||||
|
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
|
||||||
|
riscv_gnu_toolchain_version: 13.2.0
|
||||||
|
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- target: RISCV64_ZVL128B
|
||||||
|
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
|
||||||
|
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
|
||||||
|
- target: RISCV64_ZVL256B
|
||||||
|
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
|
||||||
|
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v3
|
||||||
|
|
||||||
|
- name: install build deps
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install autoconf automake autotools-dev ninja-build make \
|
||||||
|
libgomp1-riscv64-cross ccache
|
||||||
|
wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
|
||||||
|
tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
|
||||||
|
|
||||||
|
- name: Compilation cache
|
||||||
|
uses: actions/cache@v3
|
||||||
|
with:
|
||||||
|
path: ~/.ccache
|
||||||
|
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||||
|
restore-keys: |
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||||
|
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||||
|
|
||||||
|
- name: Configure ccache
|
||||||
|
run: |
|
||||||
|
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||||
|
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||||
|
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||||
|
ccache -s
|
||||||
|
|
||||||
|
- name: build OpenBLAS libs
|
||||||
|
run: |
|
||||||
|
export PATH="/opt/riscv/bin:$PATH"
|
||||||
|
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||||
|
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
|
||||||
|
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||||
|
RANLIB='ccache ${triple}-ranlib' \
|
||||||
|
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||||
|
HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
|
||||||
|
|
||||||
|
- name: build OpenBLAS tests
|
||||||
|
run: |
|
||||||
|
export PATH="/opt/riscv/bin:$PATH"
|
||||||
|
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||||
|
CC='${triple}-gcc' \
|
||||||
|
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||||
|
RANLIB='ccache ${triple}-ranlib' \
|
||||||
|
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||||
|
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
|
||||||
|
|
||||||
|
- name: build lapack-netlib tests
|
||||||
|
working-directory: ./lapack-netlib/TESTING
|
||||||
|
run: |
|
||||||
|
export PATH="/opt/riscv/bin:$PATH"
|
||||||
|
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||||
|
CC='${triple}-gcc' \
|
||||||
|
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||||
|
RANLIB='ccache ${triple}-ranlib' \
|
||||||
|
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||||
|
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
|
||||||
|
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
|
||||||
|
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
|
||||||
|
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz
|
||||||
|
|
||||||
|
- name: OpenBLAS tests
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
export PATH="/opt/riscv/bin:$PATH"
|
||||||
|
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||||
|
rm -rf ./test_out
|
||||||
|
mkdir -p ./test_out
|
||||||
|
run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \
|
||||||
|
echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \
|
||||||
|
if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \
|
||||||
|
else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \
|
||||||
|
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \
|
||||||
|
}
|
||||||
|
run_test test cblat1 &
|
||||||
|
run_test test cblat2 cblat2.dat &
|
||||||
|
run_test test cblat3 cblat3.dat &
|
||||||
|
run_test test dblat1 &
|
||||||
|
run_test test dblat2 dblat2.dat &
|
||||||
|
run_test test dblat3 dblat3.dat &
|
||||||
|
run_test test sblat1 &
|
||||||
|
run_test test sblat2 sblat2.dat &
|
||||||
|
run_test test sblat3 sblat3.dat &
|
||||||
|
run_test test zblat1 &
|
||||||
|
run_test test zblat2 zblat2.dat &
|
||||||
|
run_test test zblat3 zblat3.dat &
|
||||||
|
run_test ctest xccblat1 &
|
||||||
|
run_test ctest xccblat2 cin2 &
|
||||||
|
run_test ctest xccblat3 cin3 &
|
||||||
|
run_test ctest xdcblat1 &
|
||||||
|
run_test ctest xdcblat2 din2 &
|
||||||
|
run_test ctest xdcblat3 din3 &
|
||||||
|
run_test ctest xscblat1 &
|
||||||
|
run_test ctest xscblat2 sin2 &
|
||||||
|
run_test ctest xscblat3 sin3 &
|
||||||
|
run_test ctest xzcblat1 &
|
||||||
|
run_test ctest xzcblat2 zin2 &
|
||||||
|
run_test ctest xzcblat3 zin3 &
|
||||||
|
wait
|
||||||
|
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||||
|
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
||||||
|
|
||||||
|
- name: netlib tests
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
: # these take a very long time
|
||||||
|
echo "Skipping netlib tests in CI"
|
||||||
|
exit 0
|
||||||
|
: # comment out exit above to enable the tests
|
||||||
|
: # probably we want to identify a subset to run in CI
|
||||||
|
export PATH="/opt/riscv/bin:$PATH"
|
||||||
|
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||||
|
rm -rf ./test_out
|
||||||
|
mkdir -p ./test_out
|
||||||
|
run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \
|
||||||
|
echo "$4" >> $OUTPUT; \
|
||||||
|
echo "$CMD" >> $OUTPUT; \
|
||||||
|
qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \
|
||||||
|
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \
|
||||||
|
if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \
|
||||||
|
if grep rror $OUTPUT | grep -v "passed" | grep -q -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \
|
||||||
|
}
|
||||||
|
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" &
|
||||||
|
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" &
|
||||||
|
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" &
|
||||||
|
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" &
|
||||||
|
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" &
|
||||||
|
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" &
|
||||||
|
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" &
|
||||||
|
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" &
|
||||||
|
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
|
||||||
|
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" &
|
||||||
|
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||||
|
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" &
|
||||||
|
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" &
|
||||||
|
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" &
|
||||||
|
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||||
|
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" &
|
||||||
|
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" &
|
||||||
|
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" &
|
||||||
|
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" &
|
||||||
|
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" &
|
||||||
|
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" &
|
||||||
|
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||||
|
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||||
|
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" &
|
||||||
|
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" &
|
||||||
|
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||||
|
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" &
|
||||||
|
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" &
|
||||||
|
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" &
|
||||||
|
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||||
|
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||||
|
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" &
|
||||||
|
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" &
|
||||||
|
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||||
|
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" &
|
||||||
|
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" &
|
||||||
|
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" &
|
||||||
|
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||||
|
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||||
|
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" &
|
||||||
|
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" &
|
||||||
|
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||||
|
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" &
|
||||||
|
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" &
|
||||||
|
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" &
|
||||||
|
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||||
|
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" &
|
||||||
|
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" &
|
||||||
|
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" &
|
||||||
|
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" &
|
||||||
|
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" &
|
||||||
|
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" &
|
||||||
|
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||||
|
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||||
|
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" &
|
||||||
|
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" &
|
||||||
|
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||||
|
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||||
|
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" &
|
||||||
|
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" &
|
||||||
|
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" &
|
||||||
|
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||||
|
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||||
|
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||||
|
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" &
|
||||||
|
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" &
|
||||||
|
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||||
|
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" &
|
||||||
|
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" &
|
||||||
|
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" &
|
||||||
|
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||||
|
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||||
|
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" &
|
||||||
|
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" &
|
||||||
|
wait
|
||||||
|
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||||
|
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
|
||||||
|
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
|
||||||
|
NUMERICAL_ERRORS=-1
|
||||||
|
OTHER_ERRORS=-1
|
||||||
|
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)
|
||||||
|
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
|
||||||
|
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
|
@ -47,46 +47,59 @@ config_last.h
|
||||||
getarch
|
getarch
|
||||||
getarch_2nd
|
getarch_2nd
|
||||||
utest/openblas_utest
|
utest/openblas_utest
|
||||||
|
utest/openblas_utest_ext
|
||||||
ctest/xccblat1
|
ctest/xccblat1
|
||||||
ctest/xccblat2
|
ctest/xccblat2
|
||||||
ctest/xccblat3
|
ctest/xccblat3
|
||||||
|
ctest/xccblat3_3m
|
||||||
ctest/xdcblat1
|
ctest/xdcblat1
|
||||||
ctest/xdcblat2
|
ctest/xdcblat2
|
||||||
ctest/xdcblat3
|
ctest/xdcblat3
|
||||||
|
ctest/xdcblat3_3m
|
||||||
ctest/xscblat1
|
ctest/xscblat1
|
||||||
ctest/xscblat2
|
ctest/xscblat2
|
||||||
ctest/xscblat3
|
ctest/xscblat3
|
||||||
|
ctest/xscblat3_3m
|
||||||
ctest/xzcblat1
|
ctest/xzcblat1
|
||||||
ctest/xzcblat2
|
ctest/xzcblat2
|
||||||
ctest/xzcblat3
|
ctest/xzcblat3
|
||||||
|
ctest/xzcblat3_3m
|
||||||
exports/linktest.c
|
exports/linktest.c
|
||||||
exports/linux.def
|
exports/linux.def
|
||||||
kernel/setparam_*.c
|
kernel/setparam_*.c
|
||||||
kernel/kernel_*.h
|
kernel/kernel_*.h
|
||||||
test/CBLAT2.SUMM
|
test/CBLAT2.SUMM
|
||||||
test/CBLAT3.SUMM
|
test/CBLAT3.SUMM
|
||||||
|
test/CBLAT3_3M.SUMM
|
||||||
test/DBLAT2.SUMM
|
test/DBLAT2.SUMM
|
||||||
test/DBLAT3.SUMM
|
test/DBLAT3.SUMM
|
||||||
|
test/DBLAT3_3M.SUMM
|
||||||
test/SBLAT2.SUMM
|
test/SBLAT2.SUMM
|
||||||
test/SBLAT3.SUMM
|
test/SBLAT3.SUMM
|
||||||
|
test/SBLAT3_3M.SUMM
|
||||||
test/ZBLAT2.SUMM
|
test/ZBLAT2.SUMM
|
||||||
test/ZBLAT3.SUMM
|
test/ZBLAT3.SUMM
|
||||||
|
test/ZBLAT3_3M.SUMM
|
||||||
test/SHBLAT3.SUMM
|
test/SHBLAT3.SUMM
|
||||||
test/SBBLAT3.SUMM
|
test/SBBLAT3.SUMM
|
||||||
test/cblat1
|
test/cblat1
|
||||||
test/cblat2
|
test/cblat2
|
||||||
test/cblat3
|
test/cblat3
|
||||||
|
test/cblat3_3m
|
||||||
test/dblat1
|
test/dblat1
|
||||||
test/dblat2
|
test/dblat2
|
||||||
test/dblat3
|
test/dblat3
|
||||||
|
test/dblat3_3m
|
||||||
test/sblat1
|
test/sblat1
|
||||||
test/sblat2
|
test/sblat2
|
||||||
test/sblat3
|
test/sblat3
|
||||||
|
test/sblat3_3m
|
||||||
test/test_shgemm
|
test/test_shgemm
|
||||||
test/test_sbgemm
|
test/test_sbgemm
|
||||||
test/zblat1
|
test/zblat1
|
||||||
test/zblat2
|
test/zblat2
|
||||||
test/zblat3
|
test/zblat3
|
||||||
|
test/zblat3_3m
|
||||||
build
|
build
|
||||||
build.*
|
build.*
|
||||||
*.swp
|
*.swp
|
||||||
|
|
|
@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
|
||||||
|
|
||||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||||
|
|
||||||
|
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
|
||||||
|
|
||||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||||
|
|
||||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||||
|
@ -40,6 +42,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni
|
||||||
|
|
||||||
option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
|
option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
|
||||||
|
|
||||||
|
option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF)
|
||||||
|
|
||||||
|
set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" )
|
||||||
|
set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" )
|
||||||
|
|
||||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||||
else()
|
else()
|
||||||
|
@ -96,7 +103,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||||
|
|
||||||
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
|
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
|
||||||
|
|
||||||
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
||||||
|
|
||||||
|
@ -323,7 +330,7 @@ if (NOT NOFORTRAN)
|
||||||
# Build test and ctest
|
# Build test and ctest
|
||||||
add_subdirectory(test)
|
add_subdirectory(test)
|
||||||
endif()
|
endif()
|
||||||
if (BUILD_TESTING)
|
if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
|
||||||
add_subdirectory(lapack-netlib/TESTING)
|
add_subdirectory(lapack-netlib/TESTING)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
@ -336,11 +343,12 @@ endif()
|
||||||
add_subdirectory(cpp_thread_test)
|
add_subdirectory(cpp_thread_test)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (NOT FIXED_LIBNAME)
|
||||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
||||||
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
|
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
|
||||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||||
)
|
)
|
||||||
|
endif()
|
||||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||||
if (NOT MSVC)
|
if (NOT MSVC)
|
||||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
|
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
|
||||||
|
@ -452,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (BUILD_BENCHMARKS)
|
||||||
|
#find_package(OpenMP REQUIRED)
|
||||||
|
file(GLOB SOURCES "benchmark/*.c")
|
||||||
|
if (NOT USE_OPENMP)
|
||||||
|
file(GLOB REMFILE "benchmark/smallscaling.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
endif()
|
||||||
|
if (BUILD_WITHOUT_LAPACK)
|
||||||
|
file(GLOB REMFILE "benchmark/cholesky.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
file(GLOB REMFILE "benchmark/geev.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
file(GLOB REMFILE "benchmark/gesv.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
file(GLOB REMFILE "benchmark/getri.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
file(GLOB REMFILE "benchmark/potrf.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
file(GLOB REMFILE "benchmark/spmv.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
file(GLOB REMFILE "benchmark/symv.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
file(GLOB REMFILE "benchmark/linpack.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
endif()
|
||||||
|
if (NOT USE_GEMM3M)
|
||||||
|
file(GLOB REMFILE "benchmark/gemm3m.c")
|
||||||
|
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||||
|
endif()
|
||||||
|
foreach(source ${SOURCES})
|
||||||
|
get_filename_component(name ${source} NAME_WE)
|
||||||
|
if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
|
||||||
|
set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
|
||||||
|
foreach(define ${defines})
|
||||||
|
set(target_name "benchmark_${name}")
|
||||||
|
if (NOT "${define}" STREQUAL "DEFAULT")
|
||||||
|
string(JOIN "_" define_str ${define})
|
||||||
|
set(target_name "${target_name}_${define_str}")
|
||||||
|
endif()
|
||||||
|
if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
|
||||||
|
(NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
|
||||||
|
(NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
|
||||||
|
(NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
|
||||||
|
add_executable(${target_name} ${source})
|
||||||
|
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||||
|
target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
|
||||||
|
# target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
|
||||||
|
if (NOT "${define}" STREQUAL "DEFAULT")
|
||||||
|
target_compile_definitions(${target_name} PRIVATE ${define})
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
endforeach()
|
||||||
|
endif()
|
||||||
|
endforeach()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
# Install project
|
# Install project
|
||||||
|
|
|
@ -219,3 +219,7 @@ In chronological order:
|
||||||
|
|
||||||
* Mark Seminatore <https://github.com/mseminatore>
|
* Mark Seminatore <https://github.com/mseminatore>
|
||||||
* [2023-11-09] Improve Windows threading performance scaling
|
* [2023-11-09] Improve Windows threading performance scaling
|
||||||
|
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
|
||||||
|
|
||||||
|
* Dirreke <https://github.com/Dirreke>
|
||||||
|
* [2024-01-16] Add basic support for the CSKY architecture
|
||||||
|
|
100
Changelog.txt
100
Changelog.txt
|
@ -1,4 +1,104 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.27
|
||||||
|
4-Apr-2024
|
||||||
|
|
||||||
|
general:
|
||||||
|
- added initial (generic) support for the CSKY architecture
|
||||||
|
- capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating
|
||||||
|
underutilized or idle threads
|
||||||
|
- sped up multithreaded POTRF on all platforms
|
||||||
|
- added extension openblas_set_num_threads_local() that returns the previous thread count
|
||||||
|
- re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading
|
||||||
|
for too small workloads
|
||||||
|
- improved the fallback code used when the precompiled number of threads is exceeded,
|
||||||
|
and made it callable multiple times during the lifetime of an instance
|
||||||
|
- added CBLAS interfaces for the BLAS extensions ?AMIN, ?AMAX, CAXPYC and ZAXPYC
|
||||||
|
- fixed a potential buffer overflow in the interface to the GEMMT kernels
|
||||||
|
- fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14
|
||||||
|
- fixed unwanted case sensitivity of the character parameters in ?TRTRS
|
||||||
|
- sped up the OpenMP thread management code
|
||||||
|
- fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK
|
||||||
|
- fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library
|
||||||
|
- added a testsuite for the BLAS extensions
|
||||||
|
- modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress
|
||||||
|
spurious errors
|
||||||
|
- added support for building the benchmark collection with CMAKE
|
||||||
|
- added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds
|
||||||
|
with OpenMP enabled that use clang with gfortran
|
||||||
|
- fixed building on systems with uClibc
|
||||||
|
- added support for calling ?NRM2 with a negative increment value on all architectures
|
||||||
|
- added support for the LLVM18 version of the flang-new compiler
|
||||||
|
- fixed handling of the OPENBLAS_LOOPS variable in several benchmarks
|
||||||
|
- Integrated fixes from the Reference-LAPACK project:
|
||||||
|
- Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981)
|
||||||
|
|
||||||
|
x86:
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
- fixed GEMM3M functions failing in CMAKE builds
|
||||||
|
|
||||||
|
x86-64:
|
||||||
|
- removed all instances of sched_yield() on Linux and BSD
|
||||||
|
- fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26)
|
||||||
|
- fixed GEMM3M functions failing in CMAKE builds
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
- added compiler checks for AVX512BF16 compatibility
|
||||||
|
- fixed LLVM compiler options for Sapphire Rapids
|
||||||
|
- fixed cpu handling fallbacks for Sapphire Rapids with
|
||||||
|
disabled AVX2 in DYNAMIC_ARCH mode
|
||||||
|
- fixed extensions SCSUM and DZSUM
|
||||||
|
- improved GEMM performance for ZEN targets
|
||||||
|
|
||||||
|
arm:
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
|
||||||
|
arm64:
|
||||||
|
- added initial support for the Cortex-A76 cpu
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
- fixed default compiler options for gcc (-march and -mtune)
|
||||||
|
- added support for ArmCompilerForLinux
|
||||||
|
- added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds
|
||||||
|
- fixed mishandling of the INTERFACE64 option in CMAKE builds
|
||||||
|
- corrected SCSUM kernels (erroneously duplicating SCASUM behaviour)
|
||||||
|
- added SVE-enabled kernels for CSUM/ZSUM
|
||||||
|
- worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M
|
||||||
|
|
||||||
|
power:
|
||||||
|
- improved performance of SGEMM on POWER8/9/10
|
||||||
|
- improved performance of DGEMM on POWER10
|
||||||
|
- added support for OpenMP builds with xlc/xlf on AIX
|
||||||
|
- improved cpu autodetection for DYNAMIC_ARCH builds on older AIX
|
||||||
|
- fixed cpu core counting on AIX
|
||||||
|
- added support for building a shared library on AIX
|
||||||
|
|
||||||
|
riscv64:
|
||||||
|
- added support for the X280 cpu
|
||||||
|
- added support for semi-generic RISCV models with vector length 128 or 256
|
||||||
|
- added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
- improved cpu model autodetection
|
||||||
|
- fixed corner cases in ?AXPBY for C910V
|
||||||
|
- fixed handling of zero increments in ?AXPY kernels for C910V
|
||||||
|
|
||||||
|
loongarch64:
|
||||||
|
- added optimized kernels for ?AMIN and ?AMAX
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
- fixed handling of corner cases in ?AXPBY
|
||||||
|
- fixed computation of SAMIN and DAMIN in LSX mode
|
||||||
|
- fixed computation of ?ROT
|
||||||
|
- added optimized SSYMV and DSYMV kernels for LSX and LASX mode
|
||||||
|
- added optimized CGEMM and ZGEMM kernels for LSX and LASX mode
|
||||||
|
- added optimized CGEMV and ZGEMV kernels
|
||||||
|
|
||||||
|
mips:
|
||||||
|
- fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22)
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
- fixed mishandling of the INTERFACE64 option in CMAKE builds
|
||||||
|
|
||||||
|
zarch:
|
||||||
|
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||||
|
- fixed calculation of ?SUM on Z13
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.3.26
|
Version 0.3.26
|
||||||
2-Jan-2024
|
2-Jan-2024
|
||||||
|
|
32
Makefile
32
Makefile
|
@ -1,5 +1,9 @@
|
||||||
TOPDIR = .
|
TOPDIR = .
|
||||||
include ./Makefile.system
|
include ./Makefile.system
|
||||||
|
LNCMD = ln -fs
|
||||||
|
ifeq ($(FIXED_LIBNAME), 1)
|
||||||
|
LNCMD = true
|
||||||
|
endif
|
||||||
|
|
||||||
BLASDIRS = interface driver/level2 driver/level3 driver/others
|
BLASDIRS = interface driver/level2 driver/level3 driver/others
|
||||||
|
|
||||||
|
@ -134,17 +138,17 @@ shared : libs netlib $(RELA)
|
||||||
ifneq ($(NO_SHARED), 1)
|
ifneq ($(NO_SHARED), 1)
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||||
@$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
||||||
@$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
@$(MAKE) -C exports dyn
|
@$(MAKE) -C exports dyn
|
||||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
@$(MAKE) -C exports dll
|
@$(MAKE) -C exports dll
|
||||||
|
@ -152,6 +156,9 @@ endif
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
@$(MAKE) -C exports dll
|
@$(MAKE) -C exports dll
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(OSNAME), AIX)
|
||||||
|
@$(MAKE) -C exports so
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
tests : shared
|
tests : shared
|
||||||
|
@ -229,13 +236,13 @@ ifeq ($(INTERFACE64),1)
|
||||||
endif
|
endif
|
||||||
@echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
|
@echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
|
||||||
@echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
|
@echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
|
||||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
@touch lib.grd
|
@touch lib.grd
|
||||||
|
|
||||||
prof : prof_blas prof_lapack
|
prof : prof_blas prof_lapack
|
||||||
|
|
||||||
prof_blas :
|
prof_blas :
|
||||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||||
for d in $(SUBDIRS) ; \
|
for d in $(SUBDIRS) ; \
|
||||||
do if test -d $$d; then \
|
do if test -d $$d; then \
|
||||||
$(MAKE) -C $$d prof || exit 1 ; \
|
$(MAKE) -C $$d prof || exit 1 ; \
|
||||||
|
@ -246,7 +253,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
blas :
|
blas :
|
||||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
for d in $(BLASDIRS) ; \
|
for d in $(BLASDIRS) ; \
|
||||||
do if test -d $$d; then \
|
do if test -d $$d; then \
|
||||||
$(MAKE) -C $$d libs || exit 1 ; \
|
$(MAKE) -C $$d libs || exit 1 ; \
|
||||||
|
@ -254,7 +261,7 @@ blas :
|
||||||
done
|
done
|
||||||
|
|
||||||
hpl :
|
hpl :
|
||||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
for d in $(BLASDIRS) ../laswp exports ; \
|
for d in $(BLASDIRS) ../laswp exports ; \
|
||||||
do if test -d $$d; then \
|
do if test -d $$d; then \
|
||||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||||
|
@ -268,7 +275,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
hpl_p :
|
hpl_p :
|
||||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||||
for d in $(SUBDIRS) ../laswp exports ; \
|
for d in $(SUBDIRS) ../laswp exports ; \
|
||||||
do if test -d $$d; then \
|
do if test -d $$d; then \
|
||||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||||
|
@ -309,8 +316,12 @@ endif
|
||||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
|
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
|
||||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
else
|
||||||
|
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGIBM1)
|
||||||
|
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
else
|
else
|
||||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
@ -401,6 +412,7 @@ lapack-runtest: lapack-test
|
||||||
|
|
||||||
blas-test:
|
blas-test:
|
||||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
||||||
|
|
||||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
||||||
|
|
||||||
|
|
|
@ -58,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), CORTEXA76)
|
||||||
|
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), FT2000)
|
ifeq ($(CORE), FT2000)
|
||||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
@ -104,19 +111,25 @@ ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
|
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||||
|
ifneq ($(CROSS), 1)
|
||||||
|
CCOMMON_OPT += -mtune=native
|
||||||
|
endif
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.4-a -mtune=native
|
FCOMMON_OPT += -march=armv8.4-a
|
||||||
|
ifneq ($(CROSS), 1)
|
||||||
|
FCOMMON_OPT += -mtune=native
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
endif
|
endif
|
||||||
|
@ -132,25 +145,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||||
ifneq ($(OSNAME), Darwin)
|
ifneq ($(OSNAME), Darwin)
|
||||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||||
endif
|
endif
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
|
CCOMMON_OPT += -march=armv8.5-a+sve
|
||||||
|
ifneq ($(CROSS), 1)
|
||||||
|
CCOMMON_OPT += -mtune=native
|
||||||
|
endif
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.5-a -mtune=native
|
FCOMMON_OPT += -march=armv8.5-a
|
||||||
|
ifneq ($(CROSS), 1)
|
||||||
|
FCOMMON_OPT += -mtune=native
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
endif
|
endif
|
||||||
|
@ -258,9 +277,17 @@ endif
|
||||||
|
|
||||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||||
ifeq ($(CORE), CORTEXX1)
|
ifeq ($(CORE), CORTEXX1)
|
||||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
CCOMMON_OPT += -march=armv8.2-a
|
||||||
|
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||||
|
CCOMMON_OPT += -mtune=cortex-x1
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-x1
|
||||||
|
endif
|
||||||
|
else
|
||||||
|
CCOMMON_OPT += -mtune=cortex-a72
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
@ -271,6 +298,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||||
endif
|
endif
|
||||||
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||||
|
CCOMMON_OPT += -mtune=cortex-x2
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -mtune=cortex-x2
|
||||||
|
endif
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -290,6 +323,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||||
endif
|
endif
|
||||||
|
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||||
|
CCOMMON_OPT += -mtune=cortex-a710
|
||||||
|
ifneq ($(F_COMPILER), NAG)
|
||||||
|
FCOMMON_OPT += -mtune=cortex-a710
|
||||||
|
endif
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
ifeq ($(CORE), CK860FV)
|
||||||
|
CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||||
|
FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
|
||||||
|
endif
|
|
@ -2,11 +2,15 @@ TOPDIR = .
|
||||||
export GOTOBLAS_MAKEFILE = 1
|
export GOTOBLAS_MAKEFILE = 1
|
||||||
-include $(TOPDIR)/Makefile.conf_last
|
-include $(TOPDIR)/Makefile.conf_last
|
||||||
include ./Makefile.system
|
include ./Makefile.system
|
||||||
|
LNCMD = ln -fs
|
||||||
|
|
||||||
ifdef THELIBNAME
|
ifdef THELIBNAME
|
||||||
LIBNAME=$(THELIBNAME)
|
LIBNAME=$(THELIBNAME)
|
||||||
LIBSONAME=$(THELIBSONAME)
|
LIBSONAME=$(THELIBSONAME)
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(FIXED_LIBNAME), 1)
|
||||||
|
LNCMD = true
|
||||||
|
endif
|
||||||
ifeq ($(INTERFACE64),1)
|
ifeq ($(INTERFACE64),1)
|
||||||
USE_64BITINT=1
|
USE_64BITINT=1
|
||||||
endif
|
endif
|
||||||
|
@ -99,7 +103,7 @@ ifneq ($(NO_STATIC),1)
|
||||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
endif
|
endif
|
||||||
#for install shared library
|
#for install shared library
|
||||||
ifneq ($(NO_SHARED),1)
|
ifneq ($(NO_SHARED),1)
|
||||||
|
@ -107,21 +111,21 @@ ifneq ($(NO_SHARED),1)
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||||
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
||||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||||
|
@ -149,15 +153,15 @@ ifneq ($(NO_STATIC),1)
|
||||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
endif
|
endif
|
||||||
#for install shared library
|
#for install shared library
|
||||||
ifneq ($(NO_SHARED),1)
|
ifneq ($(NO_SHARED),1)
|
||||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
@ -170,6 +174,8 @@ endif
|
||||||
|
|
||||||
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
|
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
|
||||||
|
@echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)"
|
||||||
|
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
|
||||||
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
|
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
|
||||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
|
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
|
||||||
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
|
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
|
||||||
|
@ -186,7 +192,7 @@ endif
|
||||||
ifneq ($(NO_SHARED),1)
|
ifneq ($(NO_SHARED),1)
|
||||||
#ifeq logical or
|
#ifeq logical or
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
|
|
|
@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V)
|
||||||
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
|
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), CK860FV)
|
||||||
|
TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), x280)
|
||||||
|
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), RISCV64_ZVL256B)
|
||||||
|
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), RISCV64_ZVL128B)
|
||||||
|
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), RISCV64_GENERIC)
|
||||||
|
TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
|
||||||
|
endif
|
||||||
|
|
||||||
all: getarch_2nd
|
all: getarch_2nd
|
||||||
./getarch_2nd 0 >> $(TARGET_MAKE)
|
./getarch_2nd 0 >> $(TARGET_MAKE)
|
||||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||||
|
|
|
@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
|
||||||
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
|
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
|
||||||
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
|
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(CORE), x280)
|
||||||
|
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
|
||||||
|
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
|
||||||
|
endif
|
||||||
|
ifeq ($(CORE), RISCV64_ZVL256B)
|
||||||
|
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
|
||||||
|
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||||
|
endif
|
||||||
|
ifeq ($(CORE), RISCV64_ZVL128B)
|
||||||
|
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
|
||||||
|
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||||
|
endif
|
||||||
|
ifeq ($(CORE), RISCV64_GENERIC)
|
||||||
|
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
|
||||||
|
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
|
||||||
|
endif
|
||||||
|
|
|
@ -3,7 +3,12 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.3.26
|
VERSION = 0.3.26.dev
|
||||||
|
|
||||||
|
# If you set this prefix, the library name will be lib$(LIBNAMEPREFIX)openblas.a
|
||||||
|
# and lib$(LIBNAMEPREFIX)openblas.so, with a matching soname in the shared library
|
||||||
|
#
|
||||||
|
# LIBNAMEPREFIX = scipy
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
|
|
@ -365,8 +365,9 @@ GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||||
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
||||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
|
||||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||||
|
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||||
|
GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
||||||
# Note that the behavior of -dumpversion is compile-time-configurable for
|
# Note that the behavior of -dumpversion is compile-time-configurable for
|
||||||
# gcc-7.x and newer. Use -dumpfullversion there
|
# gcc-7.x and newer. Use -dumpfullversion there
|
||||||
ifeq ($(GCCVERSIONGTEQ7),1)
|
ifeq ($(GCCVERSIONGTEQ7),1)
|
||||||
|
@ -873,6 +874,11 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), csky)
|
||||||
|
NO_BINARY_MODE = 1
|
||||||
|
BINARY_DEFINED = 1
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# C Compiler dependent settings
|
# C Compiler dependent settings
|
||||||
#
|
#
|
||||||
|
@ -1176,7 +1182,7 @@ ifeq ($(F_COMPILER), IBM)
|
||||||
CCOMMON_OPT += -DF_INTERFACE_IBM
|
CCOMMON_OPT += -DF_INTERFACE_IBM
|
||||||
FEXTRALIB += -lxlf90
|
FEXTRALIB += -lxlf90
|
||||||
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
|
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
|
||||||
FCOMMON_OPT += -qextname
|
FCOMMON_OPT += -qextname -qzerosize
|
||||||
endif
|
endif
|
||||||
# FCOMMON_OPT += -qarch=440
|
# FCOMMON_OPT += -qarch=440
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
|
@ -1511,16 +1517,28 @@ ifndef LIBSONAMEBASE
|
||||||
LIBSONAMEBASE = openblas
|
LIBSONAMEBASE = openblas
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef LIBNAMEPREFIX
|
||||||
|
LIBNAMEPREFIX =
|
||||||
|
endif
|
||||||
|
|
||||||
|
SYMPREFIX=$(SYMBOLPREFIX)
|
||||||
|
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
|
||||||
|
SYMPREFIX=
|
||||||
|
endif
|
||||||
|
SYMSUFFIX=$(SYMBOLSUFFIX)
|
||||||
|
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
|
||||||
|
SYMSUFFIX=
|
||||||
|
endif
|
||||||
ifndef LIBNAMESUFFIX
|
ifndef LIBNAMESUFFIX
|
||||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
|
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
|
||||||
else
|
else
|
||||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
|
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), CYGWIN_NT)
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
LIBPREFIX = cyg$(LIBNAMEBASE)
|
LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE)
|
||||||
else
|
else
|
||||||
LIBPREFIX = lib$(LIBNAMEBASE)
|
LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
||||||
|
@ -1652,6 +1670,10 @@ ifeq ($(F_COMPILER),CRAY)
|
||||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||||
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(F_COMPILER),FLANGNEW)
|
||||||
|
LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||||
|
override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||||
|
endif
|
||||||
|
|
||||||
LAPACK_CFLAGS = $(CFLAGS)
|
LAPACK_CFLAGS = $(CFLAGS)
|
||||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||||
|
@ -1699,14 +1721,14 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(FIXED_LIBNAME),1)
|
||||||
|
LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
|
||||||
|
# Profiled static library name for FIXED_LIBNAME builds: same stem as LIBNAME plus the "_p" marker.
LIBNAME_P = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
|
||||||
|
endif
|
||||||
|
|
||||||
LIBDLLNAME = $(LIBPREFIX).dll
|
LIBDLLNAME = $(LIBPREFIX).dll
|
||||||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
||||||
ifneq ($(OSNAME), AIX)
|
|
||||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
||||||
else
|
|
||||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
|
||||||
endif
|
|
||||||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
||||||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
||||||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
||||||
|
|
|
@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
else ifeq ($(C_COMPILER), CLANG)
|
else ifeq ($(C_COMPILER), CLANG)
|
||||||
# cooperlake support was added in clang 12
|
# sapphire rapids support was added in clang 12
|
||||||
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
||||||
CCOMMON_OPT += -march=cooperlake
|
CCOMMON_OPT += -march=sapphirerapids
|
||||||
ifneq ($(F_COMPILER), NAG)
|
ifneq ($(F_COMPILER), NAG)
|
||||||
FCOMMON_OPT += -march=cooperlake
|
FCOMMON_OPT += -march=sapphirerapids
|
||||||
endif
|
endif
|
||||||
else # not supported in clang, fallback to avx512
|
else # not supported in clang, fallback to avx512
|
||||||
CCOMMON_OPT += -march=skylake-avx512
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
|
|
23
README.md
23
README.md
|
@ -167,6 +167,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
||||||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||||
- **Cortex A72**: same as A57 (different cpu specifications)
|
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||||
|
- **Cortex A76**: same as A57 (different cpu specifications)
|
||||||
- **Falkor**: same as A57 (different cpu specifications)
|
- **Falkor**: same as A57 (different cpu specifications)
|
||||||
- **ThunderX**: Optimized some Level-1 functions
|
- **ThunderX**: Optimized some Level-1 functions
|
||||||
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
||||||
|
@ -185,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
||||||
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
||||||
- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
|
- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
|
||||||
|
|
||||||
|
- **AIX**: Dynamic architecture with OpenXL and OpenMP.
|
||||||
|
```sh
|
||||||
|
make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
|
||||||
|
```
|
||||||
|
|
||||||
#### IBM zEnterprise System
|
#### IBM zEnterprise System
|
||||||
|
|
||||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2
|
- **Z13**: Optimized Level-3 BLAS and Level-1,2
|
||||||
|
@ -198,6 +204,21 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
||||||
```
|
```
|
||||||
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
|
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
|
||||||
|
|
||||||
|
- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
|
||||||
|
```sh
|
||||||
|
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
|
||||||
|
```
|
||||||
|
|
||||||
|
- **ZVL???B**: Level-3 BLAS and Level-1,2, including vectorised kernels, targeting generic RISC-V cores whose vector registers are at least the corresponding width; ZVL128B and ZVL256B are available.
|
||||||
|
e.g.:
|
||||||
|
```sh
|
||||||
|
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
|
||||||
|
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
|
||||||
|
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
|
||||||
|
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
|
||||||
|
HOSTCC=gcc HOSTFC=gfortran -j
|
||||||
|
```
|
||||||
|
|
||||||
### Support for multiple targets in a single library
|
### Support for multiple targets in a single library
|
||||||
|
|
||||||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
|
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
|
||||||
|
@ -227,7 +248,7 @@ Please note that it is not possible to combine support for different architectur
|
||||||
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||||
- **AIX**: Supported on PPC up to POWER8
|
- **AIX**: Supported on PPC up to POWER10
|
||||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||||
|
|
|
@ -93,6 +93,7 @@ CORTEXA53
|
||||||
CORTEXA57
|
CORTEXA57
|
||||||
CORTEXA72
|
CORTEXA72
|
||||||
CORTEXA73
|
CORTEXA73
|
||||||
|
CORTEXA76
|
||||||
CORTEXA510
|
CORTEXA510
|
||||||
CORTEXA710
|
CORTEXA710
|
||||||
CORTEXX1
|
CORTEXX1
|
||||||
|
@ -118,8 +119,11 @@ Z13
|
||||||
Z14
|
Z14
|
||||||
|
|
||||||
10.RISC-V 64:
|
10.RISC-V 64:
|
||||||
RISCV64_GENERIC
|
RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
|
||||||
|
RISCV64_ZVL128B
|
||||||
C910V
|
C910V
|
||||||
|
x280
|
||||||
|
RISCV64_ZVL256B
|
||||||
|
|
||||||
11.LOONGARCH64:
|
11.LOONGARCH64:
|
||||||
LOONGSONGENERIC
|
LOONGSONGENERIC
|
||||||
|
@ -133,3 +137,7 @@ E2K
|
||||||
EV4
|
EV4
|
||||||
EV5
|
EV5
|
||||||
EV6
|
EV6
|
||||||
|
|
||||||
|
14.CSKY
|
||||||
|
CSKY
|
||||||
|
CK860FV
|
||||||
|
|
|
@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
|
||||||
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||||
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||||
|
|
||||||
|
# x280 temporary workaround for gfortran
|
||||||
|
ifeq ($(TARGET), x280)
|
||||||
|
CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
ifneq ($(NO_LAPACK), 1)
|
ifneq ($(NO_LAPACK), 1)
|
||||||
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||||
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
|
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
|
||||||
|
|
|
@ -92,7 +92,7 @@ int main(int argc, char *argv[]){
|
||||||
|
|
||||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||||
|
|
||||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||||
|
|
||||||
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
||||||
|
|
||||||
|
|
|
@ -85,7 +85,7 @@ int main(int argc, char *argv[]){
|
||||||
double time1, time2, timeg1,timeg2;
|
double time1, time2, timeg1,timeg2;
|
||||||
|
|
||||||
char *p;
|
char *p;
|
||||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||||
|
|
||||||
argc--;argv++;
|
argc--;argv++;
|
||||||
|
|
||||||
|
|
|
@ -120,7 +120,7 @@ int main(int argc, char *argv[]){
|
||||||
|
|
||||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||||
|
|
||||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||||
|
|
||||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
|
||||||
|
|
||||||
|
|
|
@ -54,7 +54,7 @@ int main(int argc, char *argv[]){
|
||||||
int step = 1;
|
int step = 1;
|
||||||
int loops = 1;
|
int loops = 1;
|
||||||
|
|
||||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||||
|
|
||||||
double time1,timeg;
|
double time1,timeg;
|
||||||
|
|
||||||
|
|
24
c_check
24
c_check
|
@ -91,6 +91,7 @@ case "$data" in
|
||||||
*ARCH_ZARCH*) architecture=zarch ;;
|
*ARCH_ZARCH*) architecture=zarch ;;
|
||||||
*ARCH_RISCV64*) architecture=riscv64 ;;
|
*ARCH_RISCV64*) architecture=riscv64 ;;
|
||||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||||
|
*ARCH_CSKY*) architecture=csky ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
defined=0
|
defined=0
|
||||||
|
@ -236,6 +237,7 @@ case "$data" in
|
||||||
*ARCH_ARM*) architecture=arm ;;
|
*ARCH_ARM*) architecture=arm ;;
|
||||||
*ARCH_ZARCH*) architecture=zarch ;;
|
*ARCH_ZARCH*) architecture=zarch ;;
|
||||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||||
|
*ARCH_CSKY*) architecture=csky ;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
binformat='bin32'
|
binformat='bin32'
|
||||||
|
@ -244,6 +246,7 @@ case "$data" in
|
||||||
esac
|
esac
|
||||||
|
|
||||||
no_avx512=0
|
no_avx512=0
|
||||||
|
no_avx512bf=0
|
||||||
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
||||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||||
tmpf="$tmpd/a.c"
|
tmpf="$tmpd/a.c"
|
||||||
|
@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
||||||
}
|
}
|
||||||
|
|
||||||
rm -rf "$tmpd"
|
rm -rf "$tmpd"
|
||||||
|
if [ "$no_avx512" -eq 0 ]; then
|
||||||
|
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||||
|
tmpf="$tmpd/a.c"
|
||||||
|
code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
|
||||||
|
printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
|
||||||
|
if [ "$compiler" = "PGI" ]; then
|
||||||
|
args=" -tp cooperlake -c -o $tmpf.o $tmpf"
|
||||||
|
else
|
||||||
|
args=" -march=cooperlake -c -o $tmpf.o $tmpf"
|
||||||
|
fi
|
||||||
|
no_avx512bf=0
|
||||||
|
{
|
||||||
|
$compiler_name $flags $args >/dev/null 2>&1
|
||||||
|
} || {
|
||||||
|
no_avx512bf=1
|
||||||
|
}
|
||||||
|
|
||||||
|
rm -rf "$tmpd"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
no_rv64gv=0
|
no_rv64gv=0
|
||||||
|
@ -409,6 +431,7 @@ done
|
||||||
[ "$makefile" = "-" ] && {
|
[ "$makefile" = "-" ] && {
|
||||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||||
|
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||||
exit 0
|
exit 0
|
||||||
|
@ -437,6 +460,7 @@ done
|
||||||
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
|
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
|
||||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||||
|
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||||
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
||||||
|
|
|
@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||||
|
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||||
|
|
||||||
$defined = 0;
|
$defined = 0;
|
||||||
|
|
||||||
|
@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
|
||||||
$binary = 64;
|
$binary = 64;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ($architecture eq "csky") {
|
||||||
|
$defined = 1;
|
||||||
|
$binary = 32;
|
||||||
|
}
|
||||||
|
|
||||||
if ($compiler eq "PGI") {
|
if ($compiler eq "PGI") {
|
||||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||||
|
@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
|
||||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||||
|
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||||
|
|
||||||
$binformat = bin32;
|
$binformat = bin32;
|
||||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||||
|
|
22
cblas.h
22
cblas.h
|
@ -12,6 +12,7 @@ extern "C" {
|
||||||
/*Set the number of threads on runtime.*/
|
/*Set the number of threads on runtime.*/
|
||||||
void openblas_set_num_threads(int num_threads);
|
void openblas_set_num_threads(int num_threads);
|
||||||
void goto_set_num_threads(int num_threads);
|
void goto_set_num_threads(int num_threads);
|
||||||
|
int openblas_set_num_threads_local(int num_threads);
|
||||||
|
|
||||||
/*Get the number of threads on runtime.*/
|
/*Get the number of threads on runtime.*/
|
||||||
int openblas_get_num_threads(void);
|
int openblas_get_num_threads(void);
|
||||||
|
@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
||||||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
|
||||||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||||
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||||
|
|
||||||
|
void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||||
|
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||||
|
|
||||||
void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||||
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||||
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||||
|
@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
|
||||||
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||||
|
|
||||||
|
void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||||
|
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||||
|
void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||||
|
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
|
||||||
|
void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||||
|
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||||
|
void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||||
|
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||||
|
|
||||||
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
|
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
|
||||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||||
|
|
|
@ -64,6 +64,7 @@ else ()
|
||||||
"#define NEEDBUNDERSCORE 1\n")
|
"#define NEEDBUNDERSCORE 1\n")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (CMAKE_Fortran_COMPILER)
|
||||||
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
|
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
|
||||||
string(TOUPPER ${F_COMPILER} F_COMPILER)
|
string(TOUPPER ${F_COMPILER} F_COMPILER)
|
||||||
|
endif()
|
||||||
|
|
|
@ -6,9 +6,6 @@
|
||||||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||||
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||||
if (BINARY64 AND INTERFACE64)
|
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
|
||||||
endif ()
|
|
||||||
if (USE_OPENMP)
|
if (USE_OPENMP)
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
||||||
if (MIPS64)
|
if (MIPS64)
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
|
||||||
|
if (INTERFACE64)
|
||||||
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||||
|
endif ()
|
||||||
else ()
|
else ()
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -83,9 +83,14 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
if (ARM64 AND INTERFACE64)
|
||||||
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||||
|
endif ()
|
||||||
else ()
|
else ()
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
|
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||||
|
endif ()
|
||||||
if (INTERFACE64)
|
if (INTERFACE64)
|
||||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
|
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
|
@ -98,9 +103,11 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
else ()
|
else ()
|
||||||
|
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (USE_OPENMP)
|
if (USE_OPENMP)
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||||
|
|
|
@ -1,4 +1,6 @@
|
||||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||||
|
libnameprefix=@LIBNAMEPREFIX@
|
||||||
|
libnamesuffix=@LIBNAMESUFFIX@
|
||||||
libsuffix=@SUFFIX64_UNDERSCORE@
|
libsuffix=@SUFFIX64_UNDERSCORE@
|
||||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||||
|
|
||||||
|
@ -7,5 +9,5 @@ Name: OpenBLAS
|
||||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||||
Version: @OpenBLAS_VERSION@
|
Version: @OpenBLAS_VERSION@
|
||||||
URL: https://github.com/OpenMathLib/OpenBLAS
|
URL: https://github.com/OpenMathLib/OpenBLAS
|
||||||
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
|
Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix}
|
||||||
Cflags: -I${includedir}
|
Cflags: -I${includedir}
|
||||||
|
|
|
@ -932,7 +932,7 @@ endif ()
|
||||||
set(ZGEMM_UNROLL_M 4)
|
set(ZGEMM_UNROLL_M 4)
|
||||||
set(ZGEMM_UNROLL_N 4)
|
set(ZGEMM_UNROLL_N 4)
|
||||||
set(SYMV_P 16)
|
set(SYMV_P 16)
|
||||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
|
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73" OR "${TCORE}" STREQUAL "CORTEXA76")
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define L1_CODE_SIZE\t49152\n"
|
"#define L1_CODE_SIZE\t49152\n"
|
||||||
"#define L1_CODE_LINESIZE\t64\n"
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
|
|
@ -501,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas")
|
||||||
|
|
||||||
if (DEFINED LIBNAMESUFFIX)
|
if (DEFINED LIBNAMESUFFIX)
|
||||||
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
|
set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}")
|
||||||
else ()
|
|
||||||
set(LIBPREFIX "libopenblas")
|
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NOT DEFINED SYMBOLPREFIX)
|
if (NOT DEFINED SYMBOLPREFIX)
|
||||||
|
@ -615,13 +616,19 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||||
endforeach ()
|
endforeach ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY")
|
if (CMAKE_Fortran_COMPILER)
|
||||||
|
if (${F_COMPILER} STREQUAL "NAG" OR ${F_COMPILER} STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||||
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
|
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
|
||||||
|
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||||
|
message(STATUS "removing fortran flags")
|
||||||
|
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
|
||||||
|
endif ()
|
||||||
foreach (FILTER_FLAG ${FILTER_FLAGS})
|
foreach (FILTER_FLAG ${FILTER_FLAGS})
|
||||||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
|
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
|
||||||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
|
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
|
||||||
endforeach ()
|
endforeach ()
|
||||||
endif ()
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
|
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
|
||||||
# lapack-netlib is rife with uninitialized warnings -hpa
|
# lapack-netlib is rife with uninitialized warnings -hpa
|
||||||
|
@ -679,6 +686,10 @@ else ()
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (DEFINED FIXED_LIBNAME)
|
||||||
|
set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}")
|
||||||
|
set (LIBNAME "${LIBPREFIX}_p.${LIBSUFFIX}")
|
||||||
|
endif()
|
||||||
|
|
||||||
set(LIBDLLNAME "${LIBPREFIX}.dll")
|
set(LIBDLLNAME "${LIBPREFIX}.dll")
|
||||||
set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")
|
set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")
|
||||||
|
|
24
common.h
24
common.h
|
@ -358,12 +358,6 @@ typedef int blasint;
|
||||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
|
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef BULLDOZER
|
|
||||||
#ifndef YIELDING
|
|
||||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||||
#ifndef YIELDING
|
#ifndef YIELDING
|
||||||
|
@ -371,21 +365,13 @@ typedef int blasint;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
|
||||||
#ifdef PILEDRIVER
|
|
||||||
#ifndef YIELDING
|
|
||||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
#if defined(ARCH_X86_64)
|
||||||
#ifdef STEAMROLLER
|
|
||||||
#ifndef YIELDING
|
#ifndef YIELDING
|
||||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
*/
|
|
||||||
|
|
||||||
#ifdef __EMSCRIPTEN__
|
#ifdef __EMSCRIPTEN__
|
||||||
#define YIELDING
|
#define YIELDING
|
||||||
|
@ -396,7 +382,7 @@ typedef int blasint;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/***
|
/***
|
||||||
To alloc job_t on heap or statck.
|
To alloc job_t on heap or stack.
|
||||||
please https://github.com/xianyi/OpenBLAS/issues/246
|
please https://github.com/xianyi/OpenBLAS/issues/246
|
||||||
***/
|
***/
|
||||||
#if defined(OS_WINDOWS)
|
#if defined(OS_WINDOWS)
|
||||||
|
@ -482,6 +468,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||||
#include "common_e2k.h"
|
#include "common_e2k.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef ARCH_CSKY
|
||||||
|
#include "common_csky.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef ASSEMBLER
|
#ifndef ASSEMBLER
|
||||||
#ifdef OS_WINDOWSSTORE
|
#ifdef OS_WINDOWSSTORE
|
||||||
typedef char env_var_t[MAX_PATH];
|
typedef char env_var_t[MAX_PATH];
|
||||||
|
|
|
@ -0,0 +1,56 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2011-2015, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#ifndef COMMON_CSKY
|
||||||
|
#define COMMON_CSKY
|
||||||
|
|
||||||
|
#define MB __sync_synchronize()
|
||||||
|
#define WMB __sync_synchronize()
|
||||||
|
#define RMB __sync_synchronize()
|
||||||
|
|
||||||
|
#define INLINE inline
|
||||||
|
|
||||||
|
#ifndef ASSEMBLER
|
||||||
|
|
||||||
|
|
||||||
|
static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
|
return x / y;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define BUFFER_SIZE ( 32 << 20)
|
||||||
|
#define SEEK_ADDRESS
|
||||||
|
|
||||||
|
#endif
|
|
@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
|
||||||
void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
|
void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
|
||||||
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
|
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
|
||||||
|
|
||||||
|
void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
|
||||||
|
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||||
|
void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
|
||||||
|
double *, blasint *, double *, blasint *, double *, double *, blasint *);
|
||||||
|
void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
|
||||||
|
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||||
|
void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
|
||||||
|
double *, blasint *, double *, blasint *, double *, double *, blasint *);
|
||||||
|
|
||||||
int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
|
int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
|
||||||
float *, float *, blasint *, float *, blasint *,
|
float *, float *, blasint *, float *, blasint *,
|
||||||
float *, float *, blasint *);
|
float *, float *, blasint *);
|
||||||
|
@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
|
||||||
|
|
||||||
void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
||||||
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
||||||
void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
|
||||||
void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
|
||||||
|
|
||||||
void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
|
void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
|
||||||
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
|
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
|
||||||
|
|
|
@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define BUFFER_SIZE ( 32 << 20)
|
#define BUFFER_SIZE ( 32 << 20)
|
||||||
#define SEEK_ADDRESS
|
#define SEEK_ADDRESS
|
||||||
|
|
||||||
#if defined(C910V)
|
#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
|
||||||
# include <riscv_vector.h>
|
# include <riscv_vector.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
|
||||||
|
// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
|
||||||
|
#define RISCV_0p10_INTRINSICS
|
||||||
|
#define RISCV_RVV(x) x
|
||||||
|
#else
|
||||||
|
#define RISCV_RVV(x) __riscv_ ## x
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(C910V) || defined(RISCV64_ZVL256B)
|
||||||
|
# if !defined(DOUBLE)
|
||||||
|
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
|
||||||
|
# else
|
||||||
|
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
|
||||||
|
# endif
|
||||||
|
#else
|
||||||
|
# define EXTRACT_FLOAT(v) (v[0])
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -137,19 +137,20 @@ typedef struct blas_queue {
|
||||||
|
|
||||||
extern int blas_server_avail;
|
extern int blas_server_avail;
|
||||||
extern int blas_omp_number_max;
|
extern int blas_omp_number_max;
|
||||||
|
extern int blas_omp_threads_local;
|
||||||
|
|
||||||
static __inline int num_cpu_avail(int level) {
|
static __inline int num_cpu_avail(int level) {
|
||||||
|
|
||||||
#ifdef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
int openmp_nthreads;
|
int openmp_nthreads;
|
||||||
openmp_nthreads=omp_get_max_threads();
|
openmp_nthreads=omp_get_max_threads();
|
||||||
|
if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef USE_OPENMP
|
#ifndef USE_OPENMP
|
||||||
if (blas_cpu_number == 1
|
if (blas_cpu_number == 1
|
||||||
#endif
|
#else
|
||||||
#ifdef USE_OPENMP
|
if (openmp_nthreads == 1
|
||||||
if (openmp_nthreads == 1 || omp_in_parallel()
|
|
||||||
#endif
|
#endif
|
||||||
) return 1;
|
) return 1;
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,7 @@ size_t length64=sizeof(value64);
|
||||||
#define CPU_CORTEXA57 3
|
#define CPU_CORTEXA57 3
|
||||||
#define CPU_CORTEXA72 4
|
#define CPU_CORTEXA72 4
|
||||||
#define CPU_CORTEXA73 5
|
#define CPU_CORTEXA73 5
|
||||||
|
#define CPU_CORTEXA76 23
|
||||||
#define CPU_NEOVERSEN1 11
|
#define CPU_NEOVERSEN1 11
|
||||||
#define CPU_NEOVERSEV1 16
|
#define CPU_NEOVERSEV1 16
|
||||||
#define CPU_NEOVERSEN2 17
|
#define CPU_NEOVERSEN2 17
|
||||||
|
@ -89,7 +90,8 @@ static char *cpuname[] = {
|
||||||
"CORTEXX2",
|
"CORTEXX2",
|
||||||
"CORTEXA510",
|
"CORTEXA510",
|
||||||
"CORTEXA710",
|
"CORTEXA710",
|
||||||
"FT2000"
|
"FT2000",
|
||||||
|
"CORTEXA76"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *cpuname_lower[] = {
|
static char *cpuname_lower[] = {
|
||||||
|
@ -115,7 +117,8 @@ static char *cpuname_lower[] = {
|
||||||
"cortexx2",
|
"cortexx2",
|
||||||
"cortexa510",
|
"cortexa510",
|
||||||
"cortexa710",
|
"cortexa710",
|
||||||
"ft2000"
|
"ft2000",
|
||||||
|
"cortexa76"
|
||||||
};
|
};
|
||||||
|
|
||||||
int get_feature(char *search)
|
int get_feature(char *search)
|
||||||
|
@ -210,6 +213,8 @@ int detect(void)
|
||||||
return CPU_CORTEXX2;
|
return CPU_CORTEXX2;
|
||||||
else if (strstr(cpu_part, "0xd4e")) //X3
|
else if (strstr(cpu_part, "0xd4e")) //X3
|
||||||
return CPU_CORTEXX2;
|
return CPU_CORTEXX2;
|
||||||
|
else if (strstr(cpu_part, "0xd0b"))
|
||||||
|
return CPU_CORTEXA76;
|
||||||
}
|
}
|
||||||
// Qualcomm
|
// Qualcomm
|
||||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||||
|
@ -391,6 +396,7 @@ void get_cpuconfig(void)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case CPU_NEOVERSEV1:
|
case CPU_NEOVERSEV1:
|
||||||
|
case CPU_CORTEXA76:
|
||||||
printf("#define %s\n", cpuname[d]);
|
printf("#define %s\n", cpuname[d]);
|
||||||
printf("#define L1_CODE_SIZE 65536\n");
|
printf("#define L1_CODE_SIZE 65536\n");
|
||||||
printf("#define L1_CODE_LINESIZE 64\n");
|
printf("#define L1_CODE_LINESIZE 64\n");
|
||||||
|
|
|
@ -72,10 +72,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define CPU_GENERIC 0
|
#define CPU_GENERIC 0
|
||||||
#define CPU_C910V 1
|
#define CPU_C910V 1
|
||||||
|
#define CPU_x280 2
|
||||||
|
#define CPU_RISCV64_ZVL256B 3
|
||||||
|
#define CPU_RISCV64_ZVL128B 4
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"RISCV64_GENERIC",
|
"RISCV64_GENERIC",
|
||||||
"C910V"
|
"C910V",
|
||||||
|
"x280",
|
||||||
|
"CPU_RISCV64_ZVL256B",
|
||||||
|
"CPU_RISCV64_ZVL128B"
|
||||||
|
};
|
||||||
|
|
||||||
|
static char *cpuname_lower[] = {
|
||||||
|
"riscv64_generic",
|
||||||
|
"c910v",
|
||||||
|
"x280",
|
||||||
|
"riscv64_zvl256b",
|
||||||
|
"riscv64_zvl128b"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void){
|
int detect(void){
|
||||||
|
@ -86,21 +100,27 @@ int detect(void){
|
||||||
char *pmodel = NULL, *pisa = NULL;
|
char *pmodel = NULL, *pisa = NULL;
|
||||||
|
|
||||||
infile = fopen("/proc/cpuinfo", "r");
|
infile = fopen("/proc/cpuinfo", "r");
|
||||||
|
if (!infile)
|
||||||
|
return CPU_GENERIC;
|
||||||
while (fgets(buffer, sizeof(buffer), infile)){
|
while (fgets(buffer, sizeof(buffer), infile)){
|
||||||
if(!strncmp(buffer, "model name", 10)){
|
if(!strncmp(buffer, "model name", 10)){
|
||||||
strcpy(model_buffer, buffer);
|
strcpy(model_buffer, buffer);
|
||||||
pmodel = strchr(isa_buffer, ':') + 1;
|
pmodel = strchr(model_buffer, ':');
|
||||||
|
if (pmodel)
|
||||||
|
pmodel++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(!strncmp(buffer, "isa", 3)){
|
if(!strncmp(buffer, "isa", 3)){
|
||||||
strcpy(isa_buffer, buffer);
|
strcpy(isa_buffer, buffer);
|
||||||
pisa = strchr(isa_buffer, '4') + 1;
|
pisa = strchr(isa_buffer, '4');
|
||||||
|
if (pisa)
|
||||||
|
pisa++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fclose(infile);
|
fclose(infile);
|
||||||
|
|
||||||
if (!pmodel)
|
if (!pmodel || !pisa)
|
||||||
return(CPU_GENERIC);
|
return(CPU_GENERIC);
|
||||||
|
|
||||||
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
|
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
|
||||||
|
@ -140,5 +160,5 @@ void get_cpuconfig(void){
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_libname(void){
|
void get_libname(void){
|
||||||
printf("riscv64\n");
|
printf("%s", cpuname_lower[detect()]);
|
||||||
}
|
}
|
||||||
|
|
4
ctest.c
4
ctest.c
|
@ -173,6 +173,10 @@ HAVE_C11
|
||||||
ARCH_E2K
|
ARCH_E2K
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__csky__)
|
||||||
|
ARCH_CSKY
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__EMSCRIPTEN__)
|
#if defined(__EMSCRIPTEN__)
|
||||||
ARCH_RISCV64
|
ARCH_RISCV64
|
||||||
OS_WINDOWS
|
OS_WINDOWS
|
||||||
|
|
|
@ -40,6 +40,10 @@ else()
|
||||||
c_${float_char}blas1.c)
|
c_${float_char}blas1.c)
|
||||||
endif()
|
endif()
|
||||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||||
|
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||||
|
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||||
|
target_link_libraries(x${float_char}cblat1 omp pthread)
|
||||||
|
endif()
|
||||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||||
target_link_libraries(x${float_char}cblat1 m)
|
target_link_libraries(x${float_char}cblat1 m)
|
||||||
endif()
|
endif()
|
||||||
|
@ -65,6 +69,10 @@ else()
|
||||||
constant.c)
|
constant.c)
|
||||||
endif()
|
endif()
|
||||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||||
|
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||||
|
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||||
|
target_link_libraries(x${float_char}cblat2 omp pthread)
|
||||||
|
endif()
|
||||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||||
target_link_libraries(x${float_char}cblat2 m)
|
target_link_libraries(x${float_char}cblat2 m)
|
||||||
endif()
|
endif()
|
||||||
|
@ -80,6 +88,17 @@ if (NOT NOFORTRAN)
|
||||||
auxiliary.c
|
auxiliary.c
|
||||||
c_xerbla.c
|
c_xerbla.c
|
||||||
constant.c)
|
constant.c)
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||||
|
add_executable(x${float_char}cblat3_3m
|
||||||
|
c_${float_char}blat3_3m.f
|
||||||
|
c_${float_char}blas3_3m.c
|
||||||
|
c_${float_char}3chke_3m.c
|
||||||
|
auxiliary.c
|
||||||
|
c_xerbla.c
|
||||||
|
constant.c)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
else()
|
else()
|
||||||
add_executable(x${float_char}cblat3
|
add_executable(x${float_char}cblat3
|
||||||
c_${float_char}blat3c.c
|
c_${float_char}blat3c.c
|
||||||
|
@ -88,12 +107,44 @@ else()
|
||||||
auxiliary.c
|
auxiliary.c
|
||||||
c_xerbla.c
|
c_xerbla.c
|
||||||
constant.c)
|
constant.c)
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||||
|
add_executable(x${float_char}cblat3_3m
|
||||||
|
c_${float_char}blat3c_3m.c
|
||||||
|
c_${float_char}blas3_3m.c
|
||||||
|
c_${float_char}3chke_3m.c
|
||||||
|
auxiliary.c
|
||||||
|
c_xerbla.c
|
||||||
|
constant.c)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||||
|
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||||
|
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||||
|
target_link_libraries(x${float_char}cblat3 omp pthread)
|
||||||
|
endif()
|
||||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||||
target_link_libraries(x${float_char}cblat3 m)
|
target_link_libraries(x${float_char}cblat3 m)
|
||||||
endif()
|
endif()
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||||
|
target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
|
||||||
|
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||||
|
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||||
|
target_link_libraries(x${float_char}cblat3 omp pthread)
|
||||||
|
endif()
|
||||||
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||||
|
target_link_libraries(x${float_char}cblat3_3m m)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
add_test(NAME "x${float_char}cblat3"
|
add_test(NAME "x${float_char}cblat3"
|
||||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||||
|
add_test(NAME "x${float_char}cblat3_3m"
|
||||||
|
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
|
@ -5,6 +5,24 @@
|
||||||
TOPDIR = ..
|
TOPDIR = ..
|
||||||
include $(TOPDIR)/Makefile.system
|
include $(TOPDIR)/Makefile.system
|
||||||
|
|
||||||
|
SUPPORT_GEMM3M = 0
|
||||||
|
|
||||||
|
ifeq ($(ARCH), x86)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), x86_64)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), ia64)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), MIPS)
|
||||||
|
SUPPORT_GEMM3M = 1
|
||||||
|
endif
|
||||||
|
|
||||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||||
ifeq ($(F_COMPILER),GFORTRAN)
|
ifeq ($(F_COMPILER),GFORTRAN)
|
||||||
override FFLAGS += -fno-tree-vectorize
|
override FFLAGS += -fno-tree-vectorize
|
||||||
|
@ -144,9 +162,15 @@ all3targets += xdcblat3
|
||||||
endif
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX),1)
|
ifeq ($(BUILD_COMPLEX),1)
|
||||||
all3targets += xccblat3
|
all3targets += xccblat3
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
all3targets += xccblat3_3m
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX16),1)
|
ifeq ($(BUILD_COMPLEX16),1)
|
||||||
all3targets += xzcblat3
|
all3targets += xzcblat3
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
all3targets += xzcblat3_3m
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all3: $(all3targets)
|
all3: $(all3targets)
|
||||||
|
@ -181,9 +205,9 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
all3_3m: xzcblat3_3m xccblat3_3m
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
ifeq ($(BUILD_SINGLE),1)
|
ifeq ($(BUILD_COMPLEX),1)
|
||||||
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
||||||
endif
|
endif
|
||||||
ifeq ($(BUILD_COMPLEX16),1)
|
ifeq ($(BUILD_COMPLEX16),1)
|
||||||
|
@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
|
||||||
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -218,6 +243,9 @@ ifeq ($(F_COMPILER), IBM)
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
CEXTRALIB += -lgomp
|
CEXTRALIB += -lgomp
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
CEXTRALIB += -lomp
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -268,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
|
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
|
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
@ -277,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
|
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
|
||||||
|
$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -290,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
|
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
|
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
@ -299,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
|
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||||
|
ifeq ($(SUPPORT_GEMM3M),1)
|
||||||
|
xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
|
||||||
|
$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -96,7 +96,7 @@
|
||||||
INTEGER ICAMAXTEST
|
INTEGER ICAMAXTEST
|
||||||
EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST
|
EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST
|
||||||
* .. External Subroutines ..
|
* .. External Subroutines ..
|
||||||
EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
|
EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
|
||||||
* .. Intrinsic Functions ..
|
* .. Intrinsic Functions ..
|
||||||
INTRINSIC MAX
|
INTRINSIC MAX
|
||||||
* .. Common blocks ..
|
* .. Common blocks ..
|
||||||
|
@ -214,8 +214,8 @@
|
||||||
CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
|
CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
|
||||||
+ STRUE4(NP1),SFAC)
|
+ STRUE4(NP1),SFAC)
|
||||||
ELSE IF (ICASE.EQ.8) THEN
|
ELSE IF (ICASE.EQ.8) THEN
|
||||||
* .. CSCAL ..
|
* .. CSCALTEST ..
|
||||||
CALL CSCAL(N,CA,CX,INCX)
|
CALL CSCALTEST(N,CA,CX,INCX)
|
||||||
CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
|
CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
|
||||||
+ SFAC)
|
+ SFAC)
|
||||||
ELSE IF (ICASE.EQ.9) THEN
|
ELSE IF (ICASE.EQ.9) THEN
|
||||||
|
@ -236,14 +236,14 @@
|
||||||
*
|
*
|
||||||
INCX = 1
|
INCX = 1
|
||||||
IF (ICASE.EQ.8) THEN
|
IF (ICASE.EQ.8) THEN
|
||||||
* CSCAL
|
* CSCALTEST
|
||||||
* Add a test for alpha equal to zero.
|
* Add a test for alpha equal to zero.
|
||||||
CA = (0.0E0,0.0E0)
|
CA = (0.0E0,0.0E0)
|
||||||
DO 80 I = 1, 5
|
DO 80 I = 1, 5
|
||||||
MWPCT(I) = (0.0E0,0.0E0)
|
MWPCT(I) = (0.0E0,0.0E0)
|
||||||
MWPCS(I) = (1.0E0,1.0E0)
|
MWPCS(I) = (1.0E0,1.0E0)
|
||||||
80 CONTINUE
|
80 CONTINUE
|
||||||
CALL CSCAL(5,CA,CX,INCX)
|
CALL CSCALTEST(5,CA,CX,INCX)
|
||||||
CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
|
CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
|
||||||
ELSE IF (ICASE.EQ.9) THEN
|
ELSE IF (ICASE.EQ.9) THEN
|
||||||
* CSSCALTEST
|
* CSSCALTEST
|
||||||
|
|
|
@ -440,6 +440,7 @@ static real c_b43 = (float)1.;
|
||||||
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
|
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
|
||||||
static complex mwpcs[5], mwpct[5];
|
static complex mwpcs[5], mwpct[5];
|
||||||
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
|
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
|
||||||
|
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
|
||||||
static complex cx[8];
|
static complex cx[8];
|
||||||
extern real scnrm2test_(integer*, complex*, integer*);
|
extern real scnrm2test_(integer*, complex*, integer*);
|
||||||
static integer np1;
|
static integer np1;
|
||||||
|
@ -481,7 +482,7 @@ static real c_b43 = (float)1.;
|
||||||
stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
|
stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
|
||||||
} else if (combla_1.icase == 8) {
|
} else if (combla_1.icase == 8) {
|
||||||
/* .. CSCAL .. */
|
/* .. CSCAL .. */
|
||||||
cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
|
cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
|
||||||
ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
|
ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
|
||||||
&ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
|
&ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
|
||||||
} else if (combla_1.icase == 9) {
|
} else if (combla_1.icase == 9) {
|
||||||
|
@ -515,7 +516,7 @@ static real c_b43 = (float)1.;
|
||||||
mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
|
mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
|
||||||
/* L80: */
|
/* L80: */
|
||||||
}
|
}
|
||||||
cscal_(&c__5, &ca, cx, &combla_1.incx);
|
cscaltest_(&c__5, &ca, cx, &combla_1.incx);
|
||||||
ctest_(&c__5, cx, mwpct, mwpcs, sfac);
|
ctest_(&c__5, cx, mwpct, mwpcs, sfac);
|
||||||
} else if (combla_1.icase == 9) {
|
} else if (combla_1.icase == 9) {
|
||||||
/* CSSCALTEST */
|
/* CSSCALTEST */
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -545,13 +545,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
*range_n, IFLOAT *sa, IFLOAT *sb,
|
*range_n, IFLOAT *sa, IFLOAT *sb,
|
||||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||||
|
|
||||||
#ifndef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
#ifndef OS_WINDOWS
|
static omp_lock_t level3_lock, critical_section_lock;
|
||||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0,
|
||||||
#else
|
parallel_section_left = MAX_PARALLEL_NUMBER;
|
||||||
|
|
||||||
|
// Lock initialization; TODO: maybe this part can be moved to blas_init() in blas_server_omp.c
|
||||||
|
while(omp_lock_initialized == 0)
|
||||||
|
{
|
||||||
|
blas_lock(&init_lock);
|
||||||
|
{
|
||||||
|
if(omp_lock_initialized == 0)
|
||||||
|
{
|
||||||
|
omp_init_lock(&level3_lock);
|
||||||
|
omp_init_lock(&critical_section_lock);
|
||||||
|
omp_lock_initialized = 1;
|
||||||
|
WMB;
|
||||||
|
}
|
||||||
|
blas_unlock(&init_lock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#elif defined(OS_WINDOWS)
|
||||||
CRITICAL_SECTION level3_lock;
|
CRITICAL_SECTION level3_lock;
|
||||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
#endif
|
#else
|
||||||
|
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
blas_arg_t newarg;
|
blas_arg_t newarg;
|
||||||
|
@ -599,12 +617,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
#ifndef OS_WINDOWS
|
omp_set_lock(&level3_lock);
|
||||||
pthread_mutex_lock(&level3_lock);
|
omp_set_lock(&critical_section_lock);
|
||||||
#else
|
|
||||||
|
parallel_section_left--;
|
||||||
|
|
||||||
|
/*
|
||||||
|
How OpenMP locks work with NUM_PARALLEL
|
||||||
|
1) parallel_section_left = Number of available concurrent executions of OpenBLAS - Number of currently executing OpenBLAS executions
|
||||||
|
2) level3_lock is acting like a master lock or barrier which stops OpenBLAS calls when all the parallel_sections are currently busy executing other OpenBLAS calls
|
||||||
|
3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently and for unlocking of master lock whenever required
|
||||||
|
4) Unlock master lock only when we have not already exhausted all the parallel_sections and allow another thread with an OpenBLAS call to enter
|
||||||
|
*/
|
||||||
|
if(parallel_section_left != 0)
|
||||||
|
omp_unset_lock(&level3_lock);
|
||||||
|
|
||||||
|
omp_unset_lock(&critical_section_lock);
|
||||||
|
|
||||||
|
#elif defined(OS_WINDOWS)
|
||||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
#endif
|
#else
|
||||||
|
pthread_mutex_lock(&level3_lock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef USE_ALLOC_HEAP
|
#ifdef USE_ALLOC_HEAP
|
||||||
|
@ -732,12 +766,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
free(job);
|
free(job);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
#ifndef OS_WINDOWS
|
omp_set_lock(&critical_section_lock);
|
||||||
pthread_mutex_unlock(&level3_lock);
|
parallel_section_left++;
|
||||||
#else
|
|
||||||
|
/*
|
||||||
|
Unlock master lock only when all the parallel_sections are already exhausted and one of the threads has completed its OpenBLAS call
|
||||||
|
otherwise just increment the parallel_section_left
|
||||||
|
The master lock is only locked when we have exhausted all the parallel_sections, so only unlock it then and otherwise just increment the count
|
||||||
|
*/
|
||||||
|
if(parallel_section_left == 1)
|
||||||
|
omp_unset_lock(&level3_lock);
|
||||||
|
|
||||||
|
omp_unset_lock(&critical_section_lock);
|
||||||
|
|
||||||
|
#elif defined(OS_WINDOWS)
|
||||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
#endif
|
#else
|
||||||
|
pthread_mutex_unlock(&level3_lock);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void);
|
||||||
/* We need this global for checking if initialization is finished. */
|
/* We need this global for checking if initialization is finished. */
|
||||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
||||||
|
|
||||||
|
int blas_omp_threads_local = 1;
|
||||||
|
|
||||||
/* Local Variables */
|
/* Local Variables */
|
||||||
#if defined(USE_PTHREAD_LOCK)
|
#if defined(USE_PTHREAD_LOCK)
|
||||||
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
|
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
|
@ -69,6 +69,7 @@
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
int blas_omp_number_max = 0;
|
int blas_omp_number_max = 0;
|
||||||
|
int blas_omp_threads_local = 1;
|
||||||
|
|
||||||
extern int openblas_omp_adaptive_env(void);
|
extern int openblas_omp_adaptive_env(void);
|
||||||
|
|
||||||
|
@ -422,7 +423,6 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
if (i != MAX_PARALLEL_NUMBER)
|
if (i != MAX_PARALLEL_NUMBER)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (openblas_omp_adaptive_env() != 0) {
|
if (openblas_omp_adaptive_env() != 0) {
|
||||||
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
|
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
|
||||||
for (i = 0; i < num; i ++) {
|
for (i = 0; i < num; i ++) {
|
||||||
|
|
|
@ -48,6 +48,12 @@
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SMP_DEBUG
|
||||||
|
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
|
||||||
|
#else
|
||||||
|
# define MT_TRACE(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
/* This is a thread implementation for Win32 lazy implementation */
|
/* This is a thread implementation for Win32 lazy implementation */
|
||||||
|
|
||||||
/* Thread server common information */
|
/* Thread server common information */
|
||||||
|
@ -59,6 +65,8 @@ static CRITICAL_SECTION queue_lock;
|
||||||
/* We need this global for checking if initialization is finished. */
|
/* We need this global for checking if initialization is finished. */
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
|
int blas_omp_threads_local = 1;
|
||||||
|
|
||||||
/* Local Variables */
|
/* Local Variables */
|
||||||
static BLASULONG server_lock = 0;
|
static BLASULONG server_lock = 0;
|
||||||
|
|
||||||
|
@ -66,16 +74,9 @@ static HANDLE blas_threads [MAX_CPU_NUMBER];
|
||||||
static DWORD blas_threads_id[MAX_CPU_NUMBER];
|
static DWORD blas_threads_id[MAX_CPU_NUMBER];
|
||||||
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
|
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
|
||||||
|
|
||||||
#if defined (__GNUC__) && (__GNUC__ < 6)
|
//
|
||||||
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
|
// Legacy code path
|
||||||
#else
|
//
|
||||||
#if defined(_WIN64)
|
|
||||||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
|
|
||||||
#else
|
|
||||||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
|
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
|
||||||
|
|
||||||
if (!(mode & BLAS_COMPLEX)) {
|
if (!(mode & BLAS_COMPLEX)) {
|
||||||
|
@ -199,9 +200,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This is a main routine of threads. Each thread waits until job is */
|
//
|
||||||
/* queued. */
|
// This is a main routine of threads. Each thread waits until job is queued.
|
||||||
|
//
|
||||||
static DWORD WINAPI blas_thread_server(void *arg) {
|
static DWORD WINAPI blas_thread_server(void *arg) {
|
||||||
|
|
||||||
/* Thread identifier */
|
/* Thread identifier */
|
||||||
|
@ -213,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
/* Each server needs each buffer */
|
/* Each server needs each buffer */
|
||||||
buffer = blas_memory_alloc(2);
|
buffer = blas_memory_alloc(2);
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
|
||||||
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
while (1) {
|
while (1) {
|
||||||
|
|
||||||
/* Waiting for Queue */
|
/* Waiting for Queue */
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
|
||||||
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
|
|
||||||
#endif
|
|
||||||
// event raised when work is added to the queue
|
// event raised when work is added to the queue
|
||||||
WaitForSingleObject(kickoff_event, INFINITE);
|
WaitForSingleObject(kickoff_event, INFINITE);
|
||||||
|
|
||||||
if (cpu > thread_target - 2)
|
if (cpu > thread_target - 2) {
|
||||||
{
|
//MT_TRACE("thread [%d] exiting.\n", cpu);
|
||||||
//printf("thread [%d] exiting.\n", cpu);
|
|
||||||
break; // excess thread, so worker thread exits
|
break; // excess thread, so worker thread exits
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Server[%2ld] Got it.\n", cpu);
|
||||||
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
EnterCriticalSection(&queue_lock);
|
EnterCriticalSection(&queue_lock);
|
||||||
|
|
||||||
queue = work_queue;
|
queue = work_queue;
|
||||||
|
@ -245,19 +239,6 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
work_queue = work_queue->next;
|
work_queue = work_queue->next;
|
||||||
|
|
||||||
LeaveCriticalSection(&queue_lock);
|
LeaveCriticalSection(&queue_lock);
|
||||||
#else
|
|
||||||
volatile blas_queue_t* queue_next;
|
|
||||||
|
|
||||||
INT_PTR prev_value;
|
|
||||||
do {
|
|
||||||
queue = (volatile blas_queue_t*)work_queue;
|
|
||||||
if (!queue)
|
|
||||||
break;
|
|
||||||
|
|
||||||
queue_next = (volatile blas_queue_t*)queue->next;
|
|
||||||
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
|
|
||||||
} while (prev_value != queue);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (queue) {
|
if (queue) {
|
||||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
|
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
|
||||||
|
@ -270,10 +251,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
|
||||||
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
|
|
||||||
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
|
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
|
||||||
#endif
|
|
||||||
|
|
||||||
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
|
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
|
||||||
|
|
||||||
|
@ -281,7 +260,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
main_status[cpu] = MAIN_RUNNING1;
|
main_status[cpu] = MAIN_RUNNING1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
if (sa == NULL)
|
||||||
|
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||||
|
|
||||||
if (sb == NULL) {
|
if (sb == NULL) {
|
||||||
if (!(queue -> mode & BLAS_COMPLEX)) {
|
if (!(queue -> mode & BLAS_COMPLEX)) {
|
||||||
|
@ -333,7 +313,6 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (!(queue -> mode & BLAS_LEGACY)) {
|
if (!(queue -> mode & BLAS_LEGACY)) {
|
||||||
|
|
||||||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
||||||
} else {
|
} else {
|
||||||
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
||||||
|
@ -342,26 +321,23 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||||
continue; //if queue == NULL
|
continue; //if queue == NULL
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Server[%2ld] Finished!\n", cpu);
|
||||||
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
queue->finished = 1;
|
queue->finished = 1;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Shutdown procedure */
|
/* Shutdown procedure */
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
|
||||||
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
blas_memory_free(buffer);
|
blas_memory_free(buffer);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Initializing routine */
|
//
|
||||||
|
// Initializing routine
|
||||||
|
//
|
||||||
int blas_thread_init(void) {
|
int blas_thread_init(void) {
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
|
|
||||||
|
@ -369,10 +345,7 @@ int blas_thread_init(void){
|
||||||
|
|
||||||
LOCK_COMMAND(&server_lock);
|
LOCK_COMMAND(&server_lock);
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
|
||||||
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
|
|
||||||
blas_cpu_number);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (!blas_server_avail) {
|
if (!blas_server_avail) {
|
||||||
// create the kickoff Event
|
// create the kickoff Event
|
||||||
|
@ -383,7 +356,7 @@ int blas_thread_init(void){
|
||||||
InitializeCriticalSection(&queue_lock);
|
InitializeCriticalSection(&queue_lock);
|
||||||
|
|
||||||
for(i = 0; i < blas_cpu_number - 1; i++) {
|
for(i = 0; i < blas_cpu_number - 1; i++) {
|
||||||
//printf("thread_init: creating thread [%d]\n", i);
|
//MT_TRACE("thread_init: creating thread [%d]\n", i);
|
||||||
|
|
||||||
blas_threads[i] = CreateThread(NULL, 0,
|
blas_threads[i] = CreateThread(NULL, 0,
|
||||||
blas_thread_server, (void *)i,
|
blas_thread_server, (void *)i,
|
||||||
|
@ -398,14 +371,11 @@ int blas_thread_init(void){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
//
|
||||||
User can call one of two routines.
|
// User can call one of two routines.
|
||||||
|
// exec_blas_async ... immediately returns after jobs are queued.
|
||||||
exec_blas_async ... immediately returns after jobs are queued.
|
// exec_blas ... returns after jobs are finished.
|
||||||
|
//
|
||||||
exec_blas ... returns after jobs are finished.
|
|
||||||
*/
|
|
||||||
|
|
||||||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
|
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
|
||||||
|
|
||||||
#if defined(SMP_SERVER)
|
#if defined(SMP_SERVER)
|
||||||
|
@ -439,14 +409,14 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
blas_queue_t *next_item = work_queue;
|
blas_queue_t *queue_item = work_queue;
|
||||||
|
|
||||||
// find the end of the work queue
|
// find the end of the work queue
|
||||||
while (next_item)
|
while (queue_item->next)
|
||||||
next_item = next_item->next;
|
queue_item = queue_item->next;
|
||||||
|
|
||||||
// add new work to the end
|
// add new work to the end
|
||||||
next_item = queue;
|
queue_item->next = queue;
|
||||||
}
|
}
|
||||||
|
|
||||||
LeaveCriticalSection(&queue_lock);
|
LeaveCriticalSection(&queue_lock);
|
||||||
|
@ -456,16 +426,16 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Join. Wait for all queued tasks to complete
|
||||||
|
//
|
||||||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
|
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Synchronization Waiting.\n");
|
||||||
fprintf(STDERR, "Synchronization Waiting.\n");
|
|
||||||
#endif
|
|
||||||
|
|
||||||
while (num) {
|
while (num) {
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Waiting Queue ..\n");
|
||||||
fprintf(STDERR, "Waiting Queue ..\n");
|
|
||||||
#endif
|
|
||||||
while (!queue->finished)
|
while (!queue->finished)
|
||||||
YIELDING;
|
YIELDING;
|
||||||
|
|
||||||
|
@ -473,9 +443,8 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
|
||||||
num--;
|
num--;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef SMP_DEBUG
|
MT_TRACE("Completely Done.\n\n");
|
||||||
fprintf(STDERR, "Completely Done.\n\n");
|
|
||||||
#endif
|
|
||||||
// if work was added to the queue after this batch we can't sleep the worker threads
|
// if work was added to the queue after this batch we can't sleep the worker threads
|
||||||
// by resetting the event
|
// by resetting the event
|
||||||
EnterCriticalSection(&queue_lock);
|
EnterCriticalSection(&queue_lock);
|
||||||
|
@ -488,7 +457,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Execute Threads */
|
//
|
||||||
|
// Execute Threads
|
||||||
|
//
|
||||||
int exec_blas(BLASLONG num, blas_queue_t *queue) {
|
int exec_blas(BLASLONG num, blas_queue_t *queue) {
|
||||||
|
|
||||||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||||
|
@ -502,28 +473,32 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
|
|
||||||
if ((num <= 0) || (queue == NULL)) return 0;
|
if ((num <= 0) || (queue == NULL)) return 0;
|
||||||
|
|
||||||
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
|
if ((num > 1) && queue -> next)
|
||||||
|
exec_blas_async(1, queue -> next);
|
||||||
|
|
||||||
routine = queue -> routine;
|
routine = queue -> routine;
|
||||||
|
|
||||||
if (queue -> mode & BLAS_LEGACY) {
|
if (queue -> mode & BLAS_LEGACY) {
|
||||||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
||||||
} else
|
} else {
|
||||||
if (queue -> mode & BLAS_PTHREAD) {
|
if (queue -> mode & BLAS_PTHREAD) {
|
||||||
void (*pthreadcompat)(void *) = queue -> routine;
|
void (*pthreadcompat)(void *) = queue -> routine;
|
||||||
(pthreadcompat)(queue -> args);
|
(pthreadcompat)(queue -> args);
|
||||||
} else
|
} else
|
||||||
(routine)(queue -> args, queue -> range_m, queue -> range_n,
|
(routine)(queue -> args, queue -> range_m, queue -> range_n,
|
||||||
queue -> sa, queue -> sb, 0);
|
queue -> sa, queue -> sb, 0);
|
||||||
|
}
|
||||||
|
|
||||||
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
|
if ((num > 1) && queue -> next)
|
||||||
|
exec_blas_async_wait(num - 1, queue -> next);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Shutdown procedure, but user don't have to call this routine. The */
|
//
|
||||||
/* kernel automatically kill threads. */
|
// Shutdown procedure, but the user doesn't have to call this routine. The
|
||||||
|
// kernel automatically kills threads.
|
||||||
|
//
|
||||||
int BLASFUNC(blas_thread_shutdown)(void) {
|
int BLASFUNC(blas_thread_shutdown)(void) {
|
||||||
|
|
||||||
int i;
|
int i;
|
||||||
|
@ -556,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Legacy function to set number of threads
|
||||||
|
//
|
||||||
void goto_set_num_threads(int num_threads)
|
void goto_set_num_threads(int num_threads)
|
||||||
{
|
{
|
||||||
long i;
|
long i;
|
||||||
|
@ -577,11 +555,11 @@ void goto_set_num_threads(int num_threads)
|
||||||
SetEvent(kickoff_event);
|
SetEvent(kickoff_event);
|
||||||
|
|
||||||
for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
|
for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
|
||||||
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
|
//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
|
||||||
|
|
||||||
WaitForSingleObject(blas_threads[i], INFINITE);
|
WaitForSingleObject(blas_threads[i], INFINITE);
|
||||||
|
|
||||||
//printf("set_num_threads: thread [%d] has quit.\n", i);
|
//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
|
||||||
|
|
||||||
CloseHandle(blas_threads[i]);
|
CloseHandle(blas_threads[i]);
|
||||||
}
|
}
|
||||||
|
@ -610,7 +588,7 @@ void goto_set_num_threads(int num_threads)
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
|
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
|
||||||
//printf("set_num_threads: creating thread [%d]\n", i);
|
//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
|
||||||
|
|
||||||
blas_threads[i] = CreateThread(NULL, 0,
|
blas_threads[i] = CreateThread(NULL, 0,
|
||||||
blas_thread_server, (void *)i,
|
blas_thread_server, (void *)i,
|
||||||
|
@ -625,6 +603,9 @@ void goto_set_num_threads(int num_threads)
|
||||||
blas_cpu_number = num_threads;
|
blas_cpu_number = num_threads;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Openblas function to set thread count
|
||||||
|
//
|
||||||
void openblas_set_num_threads(int num)
|
void openblas_set_num_threads(int num)
|
||||||
{
|
{
|
||||||
goto_set_num_threads(num);
|
goto_set_num_threads(num);
|
||||||
|
|
|
@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR;
|
||||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||||
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
|
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
|
||||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
|
||||||
#else
|
#else
|
||||||
extern gotoblas_t gotoblas_HASWELL;
|
extern gotoblas_t gotoblas_HASWELL;
|
||||||
extern gotoblas_t gotoblas_ZEN;
|
extern gotoblas_t gotoblas_ZEN;
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||||
/* Copyright 2023 The OpenBLAS Project */
|
/* Copyright 2023-2024 The OpenBLAS Project */
|
||||||
/* All rights reserved. */
|
/* All rights reserved. */
|
||||||
/* */
|
/* */
|
||||||
/* Redistribution and use in source and binary forms, with or */
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
@ -143,12 +143,13 @@ extern gotoblas_t gotoblas_ARMV8SVE;
|
||||||
#endif
|
#endif
|
||||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||||
#endif
|
#endif
|
||||||
|
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
extern void openblas_warning(int verbose, const char * msg);
|
||||||
#define FALLBACK_VERBOSE 1
|
#define FALLBACK_VERBOSE 1
|
||||||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
||||||
|
|
||||||
#define NUM_CORETYPES 16
|
#define NUM_CORETYPES 17
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||||
|
@ -178,6 +179,7 @@ static char *corename[] = {
|
||||||
"emag8180",
|
"emag8180",
|
||||||
"neoversen1",
|
"neoversen1",
|
||||||
"neoversev1",
|
"neoversev1",
|
||||||
|
"neoversev2",
|
||||||
"neoversen2",
|
"neoversen2",
|
||||||
"thunderx3t110",
|
"thunderx3t110",
|
||||||
"cortexa55",
|
"cortexa55",
|
||||||
|
@ -198,10 +200,11 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||||
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
|
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
|
||||||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12];
|
||||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13];
|
||||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
|
||||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
|
if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
|
||||||
|
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
|
||||||
return corename[NUM_CORETYPES];
|
return corename[NUM_CORETYPES];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||||
case 9: return (&gotoblas_EMAG8180);
|
case 9: return (&gotoblas_EMAG8180);
|
||||||
case 10: return (&gotoblas_NEOVERSEN1);
|
case 10: return (&gotoblas_NEOVERSEN1);
|
||||||
case 11: return (&gotoblas_NEOVERSEV1);
|
case 11: return (&gotoblas_NEOVERSEV1);
|
||||||
case 12: return (&gotoblas_NEOVERSEN2);
|
case 12: return (&gotoblas_NEOVERSEV2);
|
||||||
case 13: return (&gotoblas_THUNDERX3T110);
|
case 13: return (&gotoblas_NEOVERSEN2);
|
||||||
case 14: return (&gotoblas_CORTEXA55);
|
case 14: return (&gotoblas_THUNDERX3T110);
|
||||||
case 15: return (&gotoblas_ARMV8SVE);
|
case 15: return (&gotoblas_CORTEXA55);
|
||||||
|
case 16: return (&gotoblas_ARMV8SVE);
|
||||||
}
|
}
|
||||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||||
openblas_warning(1, message);
|
openblas_warning(1, message);
|
||||||
|
@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) {
|
||||||
return &gotoblas_NEOVERSEN1;
|
return &gotoblas_NEOVERSEN1;
|
||||||
}else
|
}else
|
||||||
return &gotoblas_NEOVERSEV1;
|
return &gotoblas_NEOVERSEV1;
|
||||||
|
case 0xd4f:
|
||||||
|
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||||
|
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||||
|
return &gotoblas_NEOVERSEN1;
|
||||||
|
} else {
|
||||||
|
return &gotoblas_NEOVERSEV2;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
case 0xd05: // Cortex A55
|
case 0xd05: // Cortex A55
|
||||||
return &gotoblas_CORTEXA55;
|
return &gotoblas_CORTEXA55;
|
||||||
|
|
|
@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
|
||||||
#define CPU_POWER9 9
|
#define CPU_POWER9 9
|
||||||
#define CPU_POWER10 10
|
#define CPU_POWER10 10
|
||||||
|
|
||||||
|
#ifndef POWER_9
|
||||||
|
#define POWER_9 0x20000 /* 9 class CPU */
|
||||||
|
#endif
|
||||||
|
#ifndef POWER_10
|
||||||
|
#define POWER_10 0x40000 /* 10 class CPU */
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef _AIX
|
#ifdef _AIX
|
||||||
#include <sys/systemcfg.h>
|
#include <sys/systemcfg.h>
|
||||||
|
|
||||||
|
@ -62,7 +69,7 @@ static int cpuid(void)
|
||||||
else if (arch == POWER_9) return CPU_POWER9;
|
else if (arch == POWER_9) return CPU_POWER9;
|
||||||
#endif
|
#endif
|
||||||
#ifdef POWER_10
|
#ifdef POWER_10
|
||||||
else if (arch == POWER_10) return CPU_POWER10;
|
else if (arch >= POWER_10) return CPU_POWER10;
|
||||||
#endif
|
#endif
|
||||||
return CPU_UNKNOWN;
|
return CPU_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) {
|
||||||
if (gotoblas && gotoblas -> init) {
|
if (gotoblas && gotoblas -> init) {
|
||||||
strncpy(coren,gotoblas_corename(),20);
|
strncpy(coren,gotoblas_corename(),20);
|
||||||
sprintf(coremsg, "Core: %s\n",coren);
|
sprintf(coremsg, "Core: %s\n",coren);
|
||||||
|
if (getenv("GET_OPENBLAS_CORETYPE")) {
|
||||||
|
fprintf(stderr, "%s", coremsg);
|
||||||
|
}
|
||||||
openblas_warning(2, coremsg);
|
openblas_warning(2, coremsg);
|
||||||
gotoblas -> init();
|
gotoblas -> init();
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -3214,7 +3214,7 @@ void blas_shutdown(void){
|
||||||
#endif
|
#endif
|
||||||
memory[pos].lock = 0;
|
memory[pos].lock = 0;
|
||||||
}
|
}
|
||||||
if (memory_overflowed)
|
if (memory_overflowed) {
|
||||||
for (pos = 0; pos < NEW_BUFFERS; pos ++){
|
for (pos = 0; pos < NEW_BUFFERS; pos ++){
|
||||||
newmemory[pos].addr = (void *)0;
|
newmemory[pos].addr = (void *)0;
|
||||||
newmemory[pos].used = 0;
|
newmemory[pos].used = 0;
|
||||||
|
@ -3223,6 +3223,10 @@ void blas_shutdown(void){
|
||||||
#endif
|
#endif
|
||||||
newmemory[pos].lock = 0;
|
newmemory[pos].lock = 0;
|
||||||
}
|
}
|
||||||
|
free(newmemory);
|
||||||
|
newmemory = NULL;
|
||||||
|
memory_overflowed = 0;
|
||||||
|
}
|
||||||
|
|
||||||
UNLOCK_COMMAND(&alloc_lock);
|
UNLOCK_COMMAND(&alloc_lock);
|
||||||
|
|
||||||
|
|
|
@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#ifdef SMP_SERVER
|
#ifdef SMP_SERVER
|
||||||
|
|
||||||
extern void openblas_set_num_threads(int num_threads) ;
|
extern void openblas_set_num_threads(int num_threads) ;
|
||||||
|
extern int openblas_get_num_threads(void) ;
|
||||||
|
|
||||||
/* By-reference variant of openblas_set_num_threads(): reads the requested
   thread count through the pointer and forwards it by value.
   NOTE(review): the trailing underscore suggests this is the Fortran-callable
   binding - confirm against the exported symbol list. */
void openblas_set_num_threads_(int* num_threads){
  const int requested = *num_threads;
  openblas_set_num_threads(requested);
}
|
||||||
|
|
||||||
|
int openblas_set_num_threads_local(int num_threads){
|
||||||
|
int ret = openblas_get_num_threads();
|
||||||
|
openblas_set_num_threads(num_threads);
|
||||||
|
blas_omp_threads_local=num_threads;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#else
|
#else
|
||||||
//Single thread
|
//Single thread
|
||||||
|
|
||||||
|
@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
|
||||||
/* Single-threaded build: by-reference thread-count setter kept only so the
   symbol exists; the argument is ignored and nothing is changed. */
void openblas_set_num_threads_(int* num_threads){
  (void)num_threads;
}
|
||||||
|
|
||||||
|
/* Single-threaded build: the request is ignored and the reported "previous"
   thread count is always 1. */
int openblas_set_num_threads_local(int num_threads){
  (void)num_threads;
  return 1;
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -73,6 +73,10 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Append -lxlf90 to EXTRALIB when F_COMPILER and OSNAME concatenate to
# "IBMAIX" (i.e. F_COMPILER=IBM with OSNAME=AIX).
# NOTE(review): presumably this is the IBM XL Fortran runtime library - confirm.
ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
|
||||||
|
EXTRALIB += -lxlf90
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), PGI)
|
ifeq ($(C_COMPILER), PGI)
|
||||||
EXTRALIB += -pgf90libs
|
EXTRALIB += -pgf90libs
|
||||||
endif
|
endif
|
||||||
|
@ -132,8 +136,12 @@ libgoto_hpl.def : $(GENSYM)
|
||||||
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||||
|
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
|
ifeq ($(FIXED_LIBNAME),1)
|
||||||
|
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib
|
||||||
|
else
|
||||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||||
|
@ -169,8 +177,12 @@ INTERNALNAME = $(LIBPREFIX).so
|
||||||
FEXTRALIB += -lm
|
FEXTRALIB += -lm
|
||||||
EXTRALIB += -lm
|
EXTRALIB += -lm
|
||||||
else
|
else
|
||||||
|
ifeq ($(FIXED_LIBNAME),1)
|
||||||
|
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so
|
||||||
|
else
|
||||||
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||||
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
|
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
|
||||||
|
@ -248,6 +260,20 @@ endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), AIX)
|
ifeq ($(OSNAME), AIX)
|
||||||
|
|
||||||
|
# AIX: after the shared library has been built, link linktest.c against it as
# a smoke test ("OK." is printed only when the link succeeds), then remove the
# throwaway executable.  $< is the shared library, the first prerequisite.
so : ../$(LIBSONAME) linktest.c
	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c $< $(EXTRALIB) && echo OK.
	rm -f linktest
|
||||||
|
|
||||||
|
# AIX: link the shared library from the static archive ../$(LIBNAME), using
# aix.exp as the export list (-bE:aix.exp).
# The archive is listed as a prerequisite because the recipe consumes it:
# without it, a rebuilt archive would not trigger a relink.  This matches the
# other shared-library rules in this file, which also depend on ../$(LIBNAME).
../$(LIBSONAME) : ../$(LIBNAME) aix.exp
	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o $@ \
	-Wl,-bcdtors:all:-2147481648:s,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
|
||||||
|
|
||||||
|
# AIX: generate the linker export list from the symbols of the static archive.
# nm output is filtered by awk: only symbols whose type (field 2) is T, D, B,
# W, V or Z and whose name (field 1) does not start with "." are kept; W, V
# and Z symbols get a " weak" suffix.  The result is sorted and de-duplicated.
# The archive is a prerequisite so the list is regenerated whenever the
# archive changes; with no prerequisites an existing aix.exp was never
# refreshed.  $< is the archive, $@ is aix.exp.
aix.exp : ../$(LIBNAME)
	/usr/bin/nm -X32_64 -PCpgl $< | /usr/bin/awk '{ if ((($$ 2 == "T") \
	|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
	{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
	/usr/bin/sort -u > $@
|
||||||
|
|
||||||
ifeq ($(COMPILER_F77), xlf)
|
ifeq ($(COMPILER_F77), xlf)
|
||||||
|
|
||||||
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
|
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
|
||||||
|
@ -289,6 +315,11 @@ test : linktest.c
|
||||||
|
|
||||||
linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
|
linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
|
||||||
./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
|
./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
|
||||||
|
ifeq ($(F_COMPILER), IBM)
|
||||||
|
mv linktest.c linktest.c.FIRST
|
||||||
|
egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c
|
||||||
|
rm linktest.c.FIRST
|
||||||
|
endif
|
||||||
|
|
||||||
clean ::
|
clean ::
|
||||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||||
|
|
|
@ -60,6 +60,7 @@ cblasobjsc="
|
||||||
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
||||||
cblas_scnrm2 cblas_scasum cblas_cgemmt
|
cblas_scnrm2 cblas_scasum cblas_cgemmt
|
||||||
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
||||||
|
cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
|
||||||
"
|
"
|
||||||
cblasobjsd="
|
cblasobjsd="
|
||||||
cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
|
cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
|
||||||
|
@ -69,6 +70,7 @@ cblasobjsd="
|
||||||
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
||||||
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
|
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
|
||||||
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
||||||
|
cblas_damax cblas_damin
|
||||||
"
|
"
|
||||||
|
|
||||||
cblasobjss="
|
cblasobjss="
|
||||||
|
@ -80,6 +82,7 @@ cblasobjss="
|
||||||
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
||||||
cblas_strsv cblas_sgeadd cblas_sgemmt
|
cblas_strsv cblas_sgeadd cblas_sgemmt
|
||||||
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
||||||
|
cblas_samax cblas_samin
|
||||||
"
|
"
|
||||||
|
|
||||||
cblasobjsz="
|
cblasobjsz="
|
||||||
|
@ -91,6 +94,7 @@ cblasobjsz="
|
||||||
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
||||||
cblas_zaxpby cblas_zgeadd cblas_zgemmt
|
cblas_zaxpby cblas_zgeadd cblas_zgemmt
|
||||||
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
||||||
|
cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
|
||||||
"
|
"
|
||||||
|
|
||||||
cblasobjs="cblas_xerbla"
|
cblasobjs="cblas_xerbla"
|
||||||
|
@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z
|
||||||
zgedmd
|
zgedmd
|
||||||
zgedmdq
|
zgedmdq
|
||||||
"
|
"
|
||||||
|
|
||||||
|
#functions added post 3.11
|
||||||
|
|
||||||
|
lapackobjs2c="$lapackobjs2c
|
||||||
|
claqp2rk
|
||||||
|
claqp3rk
|
||||||
|
ctrsyl3
|
||||||
|
"
|
||||||
|
# claqz0
|
||||||
|
# claqz1
|
||||||
|
# claqz2
|
||||||
|
# claqz3
|
||||||
|
# clatrs3
|
||||||
|
|
||||||
|
lapackobjs2d="$lapackobjs2d
|
||||||
|
dgelqs
|
||||||
|
dgelst
|
||||||
|
dgeqp3rk
|
||||||
|
dgeqrs
|
||||||
|
dlaqp2rk
|
||||||
|
dlaqp3rk
|
||||||
|
dlarmm
|
||||||
|
dlatrs3
|
||||||
|
dtrsyl3
|
||||||
|
"
|
||||||
|
# dlaqz0
|
||||||
|
# dlaqz1
|
||||||
|
# dlaqz2
|
||||||
|
# dlaqz3
|
||||||
|
# dlaqz4
|
||||||
|
|
||||||
|
lapackobjs2z="$lapackobjs2z
|
||||||
|
zgelqs
|
||||||
|
zgelst
|
||||||
|
zgeqp3rk
|
||||||
|
zgeqrs
|
||||||
|
zlaqp2rk
|
||||||
|
zlaqp3rk
|
||||||
|
zlatrs3
|
||||||
|
zrscl
|
||||||
|
ztrsyl3
|
||||||
|
"
|
||||||
|
# zlaqz0
|
||||||
|
# zlaqz1
|
||||||
|
# zlaqz2
|
||||||
|
# zlaqz3
|
||||||
|
|
||||||
lapack_extendedprecision_objs="
|
lapack_extendedprecision_objs="
|
||||||
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
|
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
|
||||||
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
|
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
|
||||||
|
@ -1622,6 +1673,14 @@ lapackeobjsc="
|
||||||
LAPACKE_cgetsqrhrt_work
|
LAPACKE_cgetsqrhrt_work
|
||||||
LAPACKE_cungtsqr_row
|
LAPACKE_cungtsqr_row
|
||||||
LAPACKE_cungtsqr_row_work
|
LAPACKE_cungtsqr_row_work
|
||||||
|
LAPACKE_clangb
|
||||||
|
LAPACKE_clangb_work
|
||||||
|
LAPACKE_ctrsyl3
|
||||||
|
LAPACKE_ctrsyl3_work
|
||||||
|
LAPACKE_ctz_nancheck
|
||||||
|
LAPACKE_ctz_trans
|
||||||
|
LAPACKE_cunhr_col
|
||||||
|
LAPACKE_cunhr_col_work
|
||||||
"
|
"
|
||||||
|
|
||||||
lapackeobjsd="
|
lapackeobjsd="
|
||||||
|
@ -2239,6 +2298,14 @@ lapackeobjsd="
|
||||||
LAPACKE_dgetsqrhrt_work
|
LAPACKE_dgetsqrhrt_work
|
||||||
LAPACKE_dorgtsqr_row
|
LAPACKE_dorgtsqr_row
|
||||||
LAPACKE_dorgtsqr_row_work
|
LAPACKE_dorgtsqr_row_work
|
||||||
|
LAPACKE_dlangb
|
||||||
|
LAPACKE_dlangb_work
|
||||||
|
LAPACKE_dorhr_col
|
||||||
|
LAPACKE_dorhr_col_work
|
||||||
|
LAPACKE_dtrsyl3
|
||||||
|
LAPACKE_dtrsyl3_work
|
||||||
|
LAPACKE_dtz_nancheck
|
||||||
|
LAPACKE_dtz_trans
|
||||||
"
|
"
|
||||||
|
|
||||||
lapackeobjss="
|
lapackeobjss="
|
||||||
|
@ -2848,6 +2915,14 @@ lapackeobjss="
|
||||||
LAPACKE_sgetsqrhrt_work
|
LAPACKE_sgetsqrhrt_work
|
||||||
LAPACKE_sorgtsqr_row
|
LAPACKE_sorgtsqr_row
|
||||||
LAPACKE_sorgtsqr_row_work
|
LAPACKE_sorgtsqr_row_work
|
||||||
|
LAPACKE_slangb
|
||||||
|
LAPACKE_slangb_work
|
||||||
|
LAPACKE_sorhr_col
|
||||||
|
LAPACKE_sorhr_col_work
|
||||||
|
LAPACKE_strsyl3
|
||||||
|
LAPACKE_strsyl3_work
|
||||||
|
LAPACKE_stz_nancheck
|
||||||
|
LAPACKE_stz_trans
|
||||||
"
|
"
|
||||||
|
|
||||||
lapackeobjsz="
|
lapackeobjsz="
|
||||||
|
@ -3515,6 +3590,14 @@ lapackeobjsz="
|
||||||
LAPACKE_zgetsqrhrt_work
|
LAPACKE_zgetsqrhrt_work
|
||||||
LAPACKE_zungtsqr_row
|
LAPACKE_zungtsqr_row
|
||||||
LAPACKE_zungtsqr_row_work
|
LAPACKE_zungtsqr_row_work
|
||||||
|
LAPACKE_zlangb
|
||||||
|
LAPACKE_zlangb_work
|
||||||
|
LAPACKE_ztrsyl3
|
||||||
|
LAPACKE_ztrsyl3_work
|
||||||
|
LAPACKE_ztz_nancheck
|
||||||
|
LAPACKE_ztz_trans
|
||||||
|
LAPACKE_zunhr_col
|
||||||
|
LAPACKE_zunhr_col_work
|
||||||
"
|
"
|
||||||
## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
|
## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
|
||||||
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
|
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
|
||||||
|
@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s="
|
||||||
ssysv_aa_2stage ssytrf_aa_2stage
|
ssysv_aa_2stage ssytrf_aa_2stage
|
||||||
ssytrs_aa_2stage
|
ssytrs_aa_2stage
|
||||||
slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
|
slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
|
||||||
|
slarfb_gett
|
||||||
"
|
"
|
||||||
lapack_embeded_underscore_objs_c="
|
lapack_embeded_underscore_objs_c="
|
||||||
chetf2_rook chetrf_rook chetri_rook
|
chetf2_rook chetrf_rook chetri_rook
|
||||||
|
@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c="
|
||||||
csysv_aa_2stage csytrf_aa_2stage
|
csysv_aa_2stage csytrf_aa_2stage
|
||||||
csytrs_aa_2stage
|
csytrs_aa_2stage
|
||||||
claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
|
claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
|
||||||
|
clarfb_gett
|
||||||
"
|
"
|
||||||
lapack_embeded_underscore_objs_d="
|
lapack_embeded_underscore_objs_d="
|
||||||
dlasyf_rook
|
dlasyf_rook
|
||||||
|
@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d="
|
||||||
dsysv_aa_2stage
|
dsysv_aa_2stage
|
||||||
dsytrf_aa_2stage dsytrs_aa_2stage
|
dsytrf_aa_2stage dsytrs_aa_2stage
|
||||||
dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
|
dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
|
||||||
|
dlarfb_gett
|
||||||
"
|
"
|
||||||
lapack_embeded_underscore_objs_z="
|
lapack_embeded_underscore_objs_z="
|
||||||
zhetf2_rook zhetrf_rook zhetri_rook
|
zhetf2_rook zhetrf_rook zhetri_rook
|
||||||
|
@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z="
|
||||||
zhetrs_aa_2stage zsysv_aa_2stage
|
zhetrs_aa_2stage zsysv_aa_2stage
|
||||||
zsytrf_aa_2stage zsytrs_aa_2stage
|
zsytrf_aa_2stage zsytrs_aa_2stage
|
||||||
zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
|
zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
|
||||||
|
zlarfb_gett
|
||||||
"
|
"
|
||||||
|
|
||||||
dirname=`pwd -P`/../lapack-netlib
|
dirname=`pwd -P`/../lapack-netlib
|
||||||
|
|
6
f_check
6
f_check
|
@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
|
||||||
pathf90 pathf95
|
pathf90 pathf95
|
||||||
pgf95 pgf90 pgf77 pgfortran nvfortran
|
pgf95 pgf90 pgf77 pgfortran nvfortran
|
||||||
flang egfortran
|
flang egfortran
|
||||||
ifort nagfor ifx ftn crayftn"
|
ifort nagfor ifx ftn crayftn armflang"
|
||||||
|
|
||||||
for list in $lists; do
|
for list in $lists; do
|
||||||
for p in $path; do
|
for p in $path; do
|
||||||
|
@ -86,6 +86,10 @@ else
|
||||||
vendor=CRAY
|
vendor=CRAY
|
||||||
openmp='-fopenmp'
|
openmp='-fopenmp'
|
||||||
;;
|
;;
|
||||||
|
*Arm\ F90*)
|
||||||
|
vendor=FLANG
|
||||||
|
openmp='-fopenmp'
|
||||||
|
;;
|
||||||
*GNU*|*GCC*)
|
*GNU*|*GCC*)
|
||||||
|
|
||||||
v="${data#*GCC: *\) }"
|
v="${data#*GCC: *\) }"
|
||||||
|
|
94
getarch.c
94
getarch.c
|
@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include <sys/sysinfo.h>
|
#include <sys/sysinfo.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#endif
|
#endif
|
||||||
#if defined(AIX)
|
#if defined(_AIX)
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <sys/systemcfg.h>
|
||||||
#include <sys/sysinfo.h>
|
#include <sys/sysinfo.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -150,6 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/* #define FORCE_EV4 */
|
/* #define FORCE_EV4 */
|
||||||
/* #define FORCE_EV5 */
|
/* #define FORCE_EV5 */
|
||||||
/* #define FORCE_EV6 */
|
/* #define FORCE_EV6 */
|
||||||
|
/* #define FORCE_CSKY */
|
||||||
|
/* #define FORCE_CK860FV */
|
||||||
/* #define FORCE_GENERIC */
|
/* #define FORCE_GENERIC */
|
||||||
|
|
||||||
#ifdef FORCE_P2
|
#ifdef FORCE_P2
|
||||||
|
@ -1327,6 +1331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "CORTEXA73"
|
#define CORENAME "CORTEXA73"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Forced target selected by defining FORCE_CORTEXA76: fixes the architecture
   strings, library/core names and the -D flag list for this core. */
#ifdef FORCE_CORTEXA76
#define FORCE
#define CORENAME        "CORTEXA76"
#define LIBNAME         "cortexa76"
#define ARCHITECTURE    "ARM64"
#define SUBARCHITECTURE "CORTEXA76"
#define SUBDIRNAME      "arm64"
/* One flag per literal; adjacent literals concatenate into a single
   space-separated string (every literal but the last ends in a space). */
#define ARCHCONFIG \
       "-DCORTEXA76 " \
       "-DL1_CODE_SIZE=49152 " \
       "-DL1_CODE_LINESIZE=64 " \
       "-DL1_CODE_ASSOCIATIVE=3 " \
       "-DL1_DATA_SIZE=32768 " \
       "-DL1_DATA_LINESIZE=64 " \
       "-DL1_DATA_ASSOCIATIVE=2 " \
       "-DL2_SIZE=2097152 " \
       "-DL2_LINESIZE=64 " \
       "-DL2_ASSOCIATIVE=16 " \
       "-DDTB_DEFAULT_ENTRIES=64 " \
       "-DDTB_SIZE=4096 " \
       "-DHAVE_VFPV4 " \
       "-DHAVE_VFPV3 " \
       "-DHAVE_VFP " \
       "-DHAVE_NEON " \
       "-DARMV8"
#endif
|
||||||
|
|
||||||
#ifdef FORCE_CORTEXX1
|
#ifdef FORCE_CORTEXX1
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define ARCHITECTURE "ARM64"
|
#define ARCHITECTURE "ARM64"
|
||||||
|
@ -1677,9 +1696,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define LIBNAME "c910v"
|
#define LIBNAME "c910v"
|
||||||
#define CORENAME "C910V"
|
#define CORENAME "C910V"
|
||||||
#endif
|
#endif
|
||||||
|
#endif
|
||||||
|
/* Forced target selected by defining FORCE_x280: fixes the architecture
   strings, library/core names and the -D flag list for this core.
   NOTE(review): L1_DATA_SIZE=64536 is not a power of two; 65536 may have been
   intended - confirm against the hardware before changing. */
#ifdef FORCE_x280
#define FORCE
#define CORENAME        "x280"
#define LIBNAME         "x280"
#define ARCHITECTURE    "RISCV64"
#define SUBARCHITECTURE "x280"
#define SUBDIRNAME      "riscv64"
/* One flag per literal; the concatenated string keeps its trailing space. */
#define ARCHCONFIG \
       "-Dx280 " \
       "-DL1_DATA_SIZE=64536 " \
       "-DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=262144 " \
       "-DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=128 " \
       "-DDTB_SIZE=4096 " \
       "-DL2_ASSOCIATIVE=4 "
#endif
|
||||||
|
|
||||||
|
/* Forced target selected by defining FORCE_RISCV64_ZVL256B: fixes the
   architecture strings, library/core names and the -D flag list.
   NOTE(review): L1_DATA_SIZE=64536 is not a power of two; 65536 may have been
   intended - confirm before changing. */
#ifdef FORCE_RISCV64_ZVL256B
#define FORCE
#define CORENAME        "RISCV64_ZVL256B"
#define LIBNAME         "riscv64_zvl256b"
#define ARCHITECTURE    "RISCV64"
#define SUBARCHITECTURE "RISCV64_ZVL256B"
#define SUBDIRNAME      "riscv64"
/* One flag per literal; the concatenated string keeps its trailing space. */
#define ARCHCONFIG \
       "-DRISCV64_ZVL256B " \
       "-DL1_DATA_SIZE=64536 " \
       "-DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=262144 " \
       "-DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=128 " \
       "-DDTB_SIZE=4096 " \
       "-DL2_ASSOCIATIVE=4 "
#endif
|
||||||
|
|
||||||
|
/* Forced target selected by defining FORCE_RISCV64_ZVL128B: fixes the
   architecture strings, library/core names and the -D flag list. */
#ifdef FORCE_RISCV64_ZVL128B
#define FORCE
#define CORENAME        "RISCV64_ZVL128B"
#define LIBNAME         "riscv64_zvl128b"
#define ARCHITECTURE    "RISCV64"
#define SUBARCHITECTURE "RISCV64_ZVL128B"
#define SUBDIRNAME      "riscv64"
/* One flag per literal; the concatenated string keeps its trailing space. */
#define ARCHCONFIG \
       "-DRISCV64_ZVL128B " \
       "-DL1_DATA_SIZE=32768 " \
       "-DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=1048576 " \
       "-DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=128 " \
       "-DDTB_SIZE=4096 " \
       "-DL2_ASSOCIATIVE=4 "
#endif
|
||||||
|
|
||||||
#if defined(FORCE_E2K) || defined(__e2k__)
|
#if defined(FORCE_E2K) || defined(__e2k__)
|
||||||
#define FORCE
|
#define FORCE
|
||||||
|
@ -1692,6 +1748,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "generic"
|
#define CORENAME "generic"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* Forced target selected by defining FORCE_CSKY: fixes the architecture
   strings, library/core names and the -D flag list. */
#ifdef FORCE_CSKY
#define FORCE
#define ARCHITECTURE    "CSKY"
#define SUBARCHITECTURE "CSKY"
#define SUBDIRNAME      "csky"
/* Adjacent string literals are concatenated with nothing in between, so each
   piece must end in a space.  "-DCSKY" previously lacked one, which fused it
   with the next flag into "-DCSKY-DL1_DATA_SIZE=65536". */
#define ARCHCONFIG   "-DCSKY " \
       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME   "csky"
#define CORENAME  "CSKY"
#endif
|
||||||
|
|
||||||
|
/* Forced target selected by defining FORCE_CK860FV: fixes the architecture
   strings, library/core names and the -D flag list.
   NOTE(review): SUBARCHITECTURE is spelled "CK860V" while the flag, LIBNAME
   and CORENAME use "CK860FV" - confirm whether the missing "F" is intended. */
#ifdef FORCE_CK860FV
#define FORCE
#define CORENAME        "CK860FV"
#define LIBNAME         "ck860fv"
#define ARCHITECTURE    "CSKY"
#define SUBARCHITECTURE "CK860V"
#define SUBDIRNAME      "csky"
/* One flag per literal; the concatenated string keeps its trailing space. */
#define ARCHCONFIG \
       "-DCK860FV " \
       "-DL1_DATA_SIZE=65536 " \
       "-DL1_DATA_LINESIZE=32 " \
       "-DL2_SIZE=524288 " \
       "-DL2_LINESIZE=32 " \
       "-DDTB_DEFAULT_ENTRIES=64 " \
       "-DDTB_SIZE=4096 " \
       "-DL2_ASSOCIATIVE=8 "
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef FORCE
|
#ifndef FORCE
|
||||||
|
|
||||||
#ifdef USER_TARGET
|
#ifdef USER_TARGET
|
||||||
|
@ -1766,7 +1849,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define OPENBLAS_SUPPORTED
|
#define OPENBLAS_SUPPORTED
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#ifndef OPENBLAS_SUPPORTED
|
#ifndef OPENBLAS_SUPPORTED
|
||||||
#error "This arch/CPU is not supported by OpenBLAS."
|
#error "This arch/CPU is not supported by OpenBLAS."
|
||||||
#endif
|
#endif
|
||||||
|
@ -1805,11 +1887,13 @@ static int get_num_cores(void) {
|
||||||
|
|
||||||
return count;
|
return count;
|
||||||
|
|
||||||
#elif defined(AIX)
|
#elif defined(_AIX)
|
||||||
//returns the number of processors which are currently online
|
//returns the number of processors which are currently online
|
||||||
count = sysconf(_SC_NPROCESSORS_ONLN);
|
count = sysconf(_SC_NPROCESSORS_ONLN);
|
||||||
if (count <= 0) count = 2;
|
if (count <= 0) count = 2;
|
||||||
|
|
||||||
|
return count;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
return 2;
|
return 2;
|
||||||
#endif
|
#endif
|
||||||
|
@ -1831,7 +1915,7 @@ int main(int argc, char *argv[]){
|
||||||
#ifdef FORCE
|
#ifdef FORCE
|
||||||
printf("CORE=%s\n", CORENAME);
|
printf("CORE=%s\n", CORENAME);
|
||||||
#else
|
#else
|
||||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
|
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
|
||||||
printf("CORE=%s\n", get_corename());
|
printf("CORE=%s\n", get_corename());
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
@ -1979,7 +2063,7 @@ printf("ELF_VERSION=2\n");
|
||||||
#ifdef FORCE
|
#ifdef FORCE
|
||||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||||
#else
|
#else
|
||||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
|
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
|
||||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -119,6 +119,7 @@ endif ()
|
||||||
if (BUILD_BFLOAT16)
|
if (BUILD_BFLOAT16)
|
||||||
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||||
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||||
|
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||||
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||||
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||||
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||||
|
@ -130,6 +131,8 @@ endif ()
|
||||||
foreach (float_type ${FLOAT_TYPES})
|
foreach (float_type ${FLOAT_TYPES})
|
||||||
|
|
||||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||||
|
GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||||
|
|
||||||
GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
|
GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||||
GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
|
GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||||
GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
|
GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||||
|
|
|
@ -270,7 +270,8 @@ CSBLAS1OBJS = \
|
||||||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
||||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
|
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
|
||||||
|
cblas_samin.$(SUFFIX)
|
||||||
|
|
||||||
CSBLAS2OBJS = \
|
CSBLAS2OBJS = \
|
||||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
||||||
|
@ -295,7 +296,8 @@ CDBLAS1OBJS = \
|
||||||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
||||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
|
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
|
||||||
|
cblas_damin.$(SUFFIX)
|
||||||
|
|
||||||
CDBLAS2OBJS = \
|
CDBLAS2OBJS = \
|
||||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
||||||
|
@ -315,7 +317,7 @@ CCBLAS1OBJS = \
|
||||||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||||
cblas_caxpby.$(SUFFIX) \
|
cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
|
||||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
|
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
|
||||||
|
|
||||||
CCBLAS2OBJS = \
|
CCBLAS2OBJS = \
|
||||||
|
@ -340,12 +342,12 @@ CXERBLAOBJ = \
|
||||||
|
|
||||||
CZBLAS1OBJS = \
|
CZBLAS1OBJS = \
|
||||||
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||||
cblas_zcopy.$(SUFFIX) \
|
cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
|
||||||
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
||||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||||
cblas_zaxpby.$(SUFFIX) \
|
cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
|
||||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
|
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
||||||
ifeq ($(BUILD_BFLOAT16),1)
|
ifeq ($(BUILD_BFLOAT16),1)
|
||||||
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
|
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
|
||||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
|
||||||
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
|
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||||
|
|
||||||
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
|
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
|
||||||
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
|
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||||
|
|
||||||
|
cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
|
||||||
|
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||||
|
|
||||||
cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
|
cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
|
||||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||||
|
|
||||||
|
@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||||
|
|
||||||
ifeq ($(BUILD_BFLOAT16),1)
|
ifeq ($(BUILD_BFLOAT16),1)
|
||||||
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
|
||||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
if (ldc < MAX(1, m)) info = 8;
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
if (lda < MAX(1, m)) info = 5;
|
if (lda < MAX(1, m)) info = 5;
|
||||||
if (n < 0) info = 2;
|
if (n < 0) info = 1;
|
||||||
if (m < 0) info = 1;
|
if (m < 0) info = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (info >= 0) {
|
if (info >= 0) {
|
||||||
|
|
|
@ -533,8 +533,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
args.nthreads = 1;
|
args.nthreads = 1;
|
||||||
else
|
else {
|
||||||
args.nthreads = num_cpu_avail(3);
|
args.nthreads = num_cpu_avail(3);
|
||||||
|
if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD)
|
||||||
|
args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD);
|
||||||
|
}
|
||||||
|
|
||||||
args.common = NULL;
|
args.common = NULL;
|
||||||
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
|
|
|
@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
|
|
||||||
char transA, transB, Uplo;
|
char transA, transB, Uplo;
|
||||||
blasint nrowa, nrowb;
|
blasint nrowa, nrowb;
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
blasint ncolb;
|
||||||
|
#endif
|
||||||
IFLOAT *buffer;
|
IFLOAT *buffer;
|
||||||
IFLOAT *aa, *bb;
|
IFLOAT *aa, *bb;
|
||||||
FLOAT *cc;
|
FLOAT *cc;
|
||||||
|
@ -157,17 +160,25 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
uplo = 1;
|
uplo = 1;
|
||||||
|
|
||||||
nrowa = m;
|
nrowa = m;
|
||||||
if (transa) nrowa = k;
|
if (transa & 1) nrowa = k;
|
||||||
nrowb = k;
|
nrowb = k;
|
||||||
if (transb) nrowb = m;
|
#if defined(COMPLEX)
|
||||||
|
ncolb = m;
|
||||||
|
#endif
|
||||||
|
if (transb & 1) {
|
||||||
|
nrowb = m;
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
ncolb = k;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
info = 0;
|
info = 0;
|
||||||
|
|
||||||
if (ldc < MAX(1, m))
|
if (ldc < MAX(1, m))
|
||||||
info = 13;
|
info = 13;
|
||||||
if (ldb < MAX(1, nrowa))
|
if (ldb < MAX(1, nrowb))
|
||||||
info = 10;
|
info = 10;
|
||||||
if (lda < MAX(1, nrowb))
|
if (lda < MAX(1, nrowa))
|
||||||
info = 8;
|
info = 8;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
|
@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
blasint info;
|
blasint info;
|
||||||
blasint lda, ldb;
|
blasint lda, ldb;
|
||||||
FLOAT *a, *b;
|
FLOAT *a, *b;
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
blasint nrowb, ncolb;
|
||||||
|
#endif
|
||||||
XFLOAT *buffer;
|
XFLOAT *buffer;
|
||||||
|
|
||||||
PRINT_DEBUG_CNAME;
|
PRINT_DEBUG_CNAME;
|
||||||
|
@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
info = -1;
|
info = -1;
|
||||||
|
|
||||||
blasint nrowa, nrowb;
|
blasint nrowa;
|
||||||
|
#if !defined(COMPLEX)
|
||||||
|
blasint nrowb;
|
||||||
|
#endif
|
||||||
nrowa = m;
|
nrowa = m;
|
||||||
if (transa) nrowa = k;
|
if (transa & 1) nrowa = k;
|
||||||
nrowb = k;
|
nrowb = k;
|
||||||
if (transb) nrowb = m;
|
#if defined(COMPLEX)
|
||||||
|
ncolb = m;
|
||||||
|
#endif
|
||||||
|
if (transb & 1) {
|
||||||
|
nrowb = m;
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
ncolb = k;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
if (ldc < MAX(1, m))
|
if (ldc < MAX(1, m))
|
||||||
info = 13;
|
info = 13;
|
||||||
|
@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
info = -1;
|
info = -1;
|
||||||
|
|
||||||
blasint ncola, ncolb;
|
blasint ncola;
|
||||||
ncola = k;
|
#if !defined(COMPLEX)
|
||||||
if (transa) ncola = m;
|
blasint ncolb;
|
||||||
|
#endif
|
||||||
|
ncola = m;
|
||||||
|
if (transa & 1) ncola = k;
|
||||||
|
ncolb = k;
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
nrowb = m;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (transb & 1) {
|
||||||
|
#if defined(COMPLEX)
|
||||||
|
nrowb = k;
|
||||||
|
#endif
|
||||||
ncolb = m;
|
ncolb = m;
|
||||||
if (transb) ncolb = k;
|
}
|
||||||
|
|
||||||
if (ldc < MAX(1,m))
|
if (ldc < MAX(1,m))
|
||||||
info = 13;
|
info = 13;
|
||||||
if (ldb < MAX(1, ncolb))
|
if (ldb < MAX(1, ncolb))
|
||||||
info = 10;
|
|
||||||
if (lda < MAX(1, ncola))
|
|
||||||
info = 8;
|
info = 8;
|
||||||
|
if (lda < MAX(1, ncola))
|
||||||
|
info = 10;
|
||||||
if (k < 0)
|
if (k < 0)
|
||||||
info = 5;
|
info = 5;
|
||||||
if (m < 0)
|
if (m < 0)
|
||||||
info = 4;
|
info = 4;
|
||||||
if (transb < 0)
|
if (transb < 0)
|
||||||
info = 3;
|
|
||||||
if (transa < 0)
|
|
||||||
info = 2;
|
info = 2;
|
||||||
|
if (transa < 0)
|
||||||
|
info = 3;
|
||||||
if (uplo < 0)
|
if (uplo < 0)
|
||||||
info = 1;
|
info = 1;
|
||||||
}
|
}
|
||||||
|
@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
const blasint incb = (transb == 0) ? 1 : ldb;
|
#if defined(COMPLEX)
|
||||||
|
if (transb > 1){
|
||||||
|
#ifndef CBLAS
|
||||||
|
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||||
|
#else
|
||||||
|
if (order == CblasColMajor)
|
||||||
|
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||||
|
if (order == CblasRowMajor)
|
||||||
|
IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
|
||||||
|
|
||||||
if (uplo == 1) {
|
if (uplo == 1) {
|
||||||
for (i = 0; i < m; i++) {
|
for (i = 0; i < m; i++) {
|
||||||
|
@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
aa = a + i * 2;
|
aa = a + i * 2;
|
||||||
bb = b + i * ldb * 2;
|
bb = b + i * ldb * 2;
|
||||||
if (transa) {
|
if (transa & 1) {
|
||||||
aa = a + lda * i * 2;
|
aa = a + lda * i * 2;
|
||||||
}
|
}
|
||||||
if (transb)
|
if (transb & 1)
|
||||||
bb = b + i * 2;
|
bb = b + i * 2;
|
||||||
cc = c + i * 2 * ldc + i * 2;
|
cc = c + i * 2 * ldc + i * 2;
|
||||||
#else
|
#else
|
||||||
aa = a + i;
|
aa = a + i;
|
||||||
bb = b + i * ldb;
|
bb = b + i * ldb;
|
||||||
if (transa) {
|
if (transa & 1) {
|
||||||
aa = a + lda * i;
|
aa = a + lda * i;
|
||||||
}
|
}
|
||||||
if (transb)
|
if (transb & 1)
|
||||||
bb = b + i;
|
bb = b + i;
|
||||||
cc = c + i * ldc + i;
|
cc = c + i * ldc + i;
|
||||||
#endif
|
#endif
|
||||||
|
@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
NULL, 0);
|
NULL, 0);
|
||||||
|
|
||||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||||
return;
|
continue;
|
||||||
#else
|
#else
|
||||||
if (beta != ONE)
|
if (beta != ONE)
|
||||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||||
|
@ -472,13 +522,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
buffer_size += 160 / sizeof(FLOAT);
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
#endif
|
#endif
|
||||||
// for alignment
|
// for alignment
|
||||||
buffer_size = (buffer_size + 3) & ~3;
|
buffer_size = (buffer_size + 3) & ~3;
|
||||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
||||||
|
@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
if (!transa)
|
if (!(transa & 1))
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||||
aa, lda, bb, incb, cc, 1,
|
aa, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
|
@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
aa, lda, bb, incb, cc, 1,
|
aa, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
#else
|
#else
|
||||||
if (!transa)
|
if (!(transa & 1))
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
||||||
bb, incb, cc, 1, buffer);
|
bb, incb, cc, 1, buffer);
|
||||||
else
|
else
|
||||||
|
@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
} else {
|
} else {
|
||||||
if (!transa)
|
if (!(transa & 1))
|
||||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||||
lda, bb, incb, cc,
|
lda, bb, incb, cc,
|
||||||
1, buffer,
|
1, buffer,
|
||||||
|
@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
l = j;
|
l = j;
|
||||||
#if defined COMPLEX
|
#if defined COMPLEX
|
||||||
bb = b + i * ldb * 2;
|
bb = b + i * ldb * 2;
|
||||||
if (transb) {
|
if (transb & 1) {
|
||||||
bb = b + i * 2;
|
bb = b + i * 2;
|
||||||
}
|
}
|
||||||
cc = c + i * 2 * ldc;
|
cc = c + i * 2 * ldc;
|
||||||
#else
|
#else
|
||||||
bb = b + i * ldb;
|
bb = b + i * ldb;
|
||||||
if (transb) {
|
if (transb & 1) {
|
||||||
bb = b + i;
|
bb = b + i;
|
||||||
}
|
}
|
||||||
cc = c + i * ldc;
|
cc = c + i * ldc;
|
||||||
|
@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
NULL, 0);
|
NULL, 0);
|
||||||
|
|
||||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||||
return;
|
continue;
|
||||||
#else
|
#else
|
||||||
if (beta != ONE)
|
if (beta != ONE)
|
||||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||||
|
@ -561,13 +611,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
|
||||||
#ifdef WINDOWS_ABI
|
#ifdef WINDOWS_ABI
|
||||||
buffer_size += 160 / sizeof(FLOAT);
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
#endif
|
#endif
|
||||||
// for alignment
|
// for alignment
|
||||||
buffer_size = (buffer_size + 3) & ~3;
|
buffer_size = (buffer_size + 3) & ~3;
|
||||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
||||||
|
@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(COMPLEX)
|
#if defined(COMPLEX)
|
||||||
if (!transa)
|
if (!(transa & 1))
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||||
a, lda, bb, incb, cc, 1,
|
a, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
|
@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
a, lda, bb, incb, cc, 1,
|
a, lda, bb, incb, cc, 1,
|
||||||
buffer);
|
buffer);
|
||||||
#else
|
#else
|
||||||
if (!transa)
|
if (!(transa & 1))
|
||||||
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
||||||
incb, cc, 1, buffer);
|
incb, cc, 1, buffer);
|
||||||
else
|
else
|
||||||
|
@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
} else {
|
} else {
|
||||||
if (!transa)
|
if (!(transa & 1))
|
||||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||||
bb, incb, cc, 1,
|
bb, incb, cc, 1,
|
||||||
buffer, nthreads);
|
buffer, nthreads);
|
||||||
|
|
|
@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
|
||||||
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
|
if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
|
||||||
nthreads = 1;
|
nthreads = 1;
|
||||||
else
|
else
|
||||||
nthreads = num_cpu_avail(2);
|
nthreads = num_cpu_avail(2);
|
||||||
|
|
|
@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
|
if ( *rows > *cols )
|
||||||
|
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT);
|
||||||
|
else
|
||||||
|
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT);
|
||||||
|
|
||||||
b = malloc(msize);
|
b = malloc(msize);
|
||||||
if ( b == NULL )
|
if ( b == NULL )
|
||||||
|
|
|
@ -95,14 +95,19 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
args.common = NULL;
|
args.common = NULL;
|
||||||
|
|
||||||
#ifndef DOUBLE
|
#ifndef DOUBLE
|
||||||
if (args.m*args.n < 40000)
|
int nmax = 40000;
|
||||||
#else
|
#else
|
||||||
if (args.m*args.n < 10000)
|
int nmax = 10000;
|
||||||
#endif
|
#endif
|
||||||
|
if (args.m*args.n <nmax) {
|
||||||
args.nthreads = 1;
|
args.nthreads = 1;
|
||||||
else
|
} else {
|
||||||
args.nthreads = num_cpu_avail(4);
|
args.nthreads = num_cpu_avail(4);
|
||||||
|
if ((args.m*args.n)/args.nthreads <nmax)
|
||||||
|
args.nthreads = (args.m*args.n)/nmax;
|
||||||
|
}
|
||||||
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -113,13 +113,17 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
args.common = NULL;
|
args.common = NULL;
|
||||||
#ifndef DOUBLE
|
#ifndef DOUBLE
|
||||||
if (args.n <128)
|
int nmax = 128;
|
||||||
#else
|
#else
|
||||||
if (args.n <64)
|
int nmax = 64;
|
||||||
#endif
|
#endif
|
||||||
|
if (args.n <nmax) {
|
||||||
args.nthreads = 1;
|
args.nthreads = 1;
|
||||||
else
|
} else {
|
||||||
args.nthreads = num_cpu_avail(4);
|
args.nthreads = num_cpu_avail(4);
|
||||||
|
if (args.n/args.nthreads <nmax)
|
||||||
|
args.nthreads = args.n/nmax;
|
||||||
|
}
|
||||||
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
|
||||||
if (trans_arg == 'R') trans = 0;
|
if (trans_arg == 'R') trans = 0;
|
||||||
if (trans_arg == 'C') trans = 1;
|
if (trans_arg == 'C') trans = 1;
|
||||||
|
|
||||||
|
TOUPPER(uplo_arg);
|
||||||
uplo = -1;
|
uplo = -1;
|
||||||
if (uplo_arg == 'U') uplo = 0;
|
if (uplo_arg == 'U') uplo = 0;
|
||||||
if (uplo_arg == 'L') uplo = 1;
|
if (uplo_arg == 'L') uplo = 1;
|
||||||
|
|
||||||
|
TOUPPER(diag_arg);
|
||||||
diag = -1;
|
diag = -1;
|
||||||
if (diag_arg == 'U') diag = 0;
|
if (diag_arg == 'U') diag = 0;
|
||||||
if (diag_arg == 'N') diag = 1;
|
if (diag_arg == 'N') diag = 1;
|
||||||
|
|
|
@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
|
||||||
if (trans_arg == 'R') trans = 2;
|
if (trans_arg == 'R') trans = 2;
|
||||||
if (trans_arg == 'C') trans = 3;
|
if (trans_arg == 'C') trans = 3;
|
||||||
|
|
||||||
|
TOUPPER(uplo_arg);
|
||||||
uplo = -1;
|
uplo = -1;
|
||||||
if (uplo_arg == 'U') uplo = 0;
|
if (uplo_arg == 'U') uplo = 0;
|
||||||
if (uplo_arg == 'L') uplo = 1;
|
if (uplo_arg == 'L') uplo = 1;
|
||||||
|
|
||||||
|
TOUPPER(diag_arg);
|
||||||
diag = -1;
|
diag = -1;
|
||||||
if (diag_arg == 'U') diag = 0;
|
if (diag_arg == 'U') diag = 0;
|
||||||
if (diag_arg == 'N') diag = 1;
|
if (diag_arg == 'N') diag = 1;
|
||||||
|
|
|
@ -46,6 +46,12 @@
|
||||||
|
|
||||||
#ifdef USE_ABS
|
#ifdef USE_ABS
|
||||||
|
|
||||||
|
#if defined(DOUBLE)
|
||||||
|
#define ABS fabs
|
||||||
|
#else
|
||||||
|
#define ABS fabsf
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef USE_MIN
|
#ifndef USE_MIN
|
||||||
|
|
||||||
/* ABS & MAX */
|
/* ABS & MAX */
|
||||||
|
@ -92,6 +98,8 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#define ABS
|
||||||
|
|
||||||
#ifndef USE_MIN
|
#ifndef USE_MIN
|
||||||
|
|
||||||
/* MAX */
|
/* MAX */
|
||||||
|
@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
||||||
|
|
||||||
if (n <= 0) return 0;
|
if (n <= 0) return 0;
|
||||||
|
|
||||||
|
#ifndef COMPLEX
|
||||||
|
if (incx == 0) return (ABS(*x));
|
||||||
|
#else
|
||||||
|
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
|
||||||
|
#endif
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
FUNCTION_PROFILE_START();
|
||||||
|
@ -145,7 +159,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#ifdef COMPLEX
|
||||||
|
FLOAT CNAME(blasint n, void *vx, blasint incx){
|
||||||
|
FLOAT *x = (FLOAT*) vx;
|
||||||
|
#else
|
||||||
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||||
|
#endif
|
||||||
|
|
||||||
FLOAT ret;
|
FLOAT ret;
|
||||||
|
|
||||||
|
@ -153,6 +172,12 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||||
|
|
||||||
if (n <= 0) return 0;
|
if (n <= 0) return 0;
|
||||||
|
|
||||||
|
#ifndef COMPLEX
|
||||||
|
if (incx == 0) return (ABS(*x));
|
||||||
|
#else
|
||||||
|
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
|
||||||
|
#endif
|
||||||
|
|
||||||
IDEBUG_START;
|
IDEBUG_START;
|
||||||
|
|
||||||
FUNCTION_PROFILE_START();
|
FUNCTION_PROFILE_START();
|
||||||
|
|
|
@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
dp2 = *dd2 * dy1;
|
dp2 = *dd2 * dy1;
|
||||||
if(dp2 == ZERO)
|
|
||||||
{
|
|
||||||
dflag = -TWO;
|
|
||||||
dparam[0] = dflag;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
dp1 = *dd1 * *dx1;
|
dp1 = *dd1 * *dx1;
|
||||||
dq2 = dp2 * dy1;
|
dq2 = dp2 * dy1;
|
||||||
dq1 = dp1 * *dx1;
|
dq1 = dp1 * *dx1;
|
||||||
|
@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||||
dh12 = dp2 / dp1;
|
dh12 = dp2 / dp1;
|
||||||
|
|
||||||
du = ONE - dh12 * dh21;
|
du = ONE - dh12 * dh21;
|
||||||
if(du > ZERO)
|
|
||||||
{
|
|
||||||
dflag = ZERO;
|
dflag = ZERO;
|
||||||
*dd1 = *dd1 / du;
|
*dd1 = *dd1 / du;
|
||||||
*dd2 = *dd2 / du;
|
*dd2 = *dd2 / du;
|
||||||
*dx1 = *dx1 * du;
|
*dx1 = *dx1 * du;
|
||||||
} else {
|
|
||||||
dflag = -ONE;
|
|
||||||
|
|
||||||
dh11 = ZERO;
|
|
||||||
dh12 = ZERO;
|
|
||||||
dh21 = ZERO;
|
|
||||||
dh22 = ZERO;
|
|
||||||
|
|
||||||
*dd1 = ZERO;
|
|
||||||
*dd2 = ZERO;
|
|
||||||
*dx1 = ZERO;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -0,0 +1,447 @@
|
||||||
|
/*********************************************************************/
|
||||||
|
/* Copyright 2024, The OpenBLAS Project. */
|
||||||
|
/* All rights reserved. */
|
||||||
|
/* */
|
||||||
|
/* Redistribution and use in source and binary forms, with or */
|
||||||
|
/* without modification, are permitted provided that the following */
|
||||||
|
/* conditions are met: */
|
||||||
|
/* */
|
||||||
|
/* 1. Redistributions of source code must retain the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer. */
|
||||||
|
/* */
|
||||||
|
/* 2. Redistributions in binary form must reproduce the above */
|
||||||
|
/* copyright notice, this list of conditions and the following */
|
||||||
|
/* disclaimer in the documentation and/or other materials */
|
||||||
|
/* provided with the distribution. */
|
||||||
|
/* */
|
||||||
|
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||||
|
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||||
|
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||||
|
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||||
|
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||||
|
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||||
|
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||||
|
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||||
|
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||||
|
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||||
|
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||||
|
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||||
|
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||||
|
/* */
|
||||||
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define SMP_THRESHOLD_MIN 65536.0
|
||||||
|
#define ERROR_NAME "SBGEMMT "
|
||||||
|
|
||||||
|
#ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||||
|
#define GEMM_MULTITHREAD_THRESHOLD 4
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef CBLAS
|
||||||
|
|
||||||
|
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||||
|
blasint * M, blasint * K,
|
||||||
|
FLOAT * Alpha,
|
||||||
|
IFLOAT * a, blasint * ldA,
|
||||||
|
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
||||||
|
{
|
||||||
|
|
||||||
|
blasint m, k;
|
||||||
|
blasint lda, ldb, ldc;
|
||||||
|
int transa, transb, uplo;
|
||||||
|
blasint info;
|
||||||
|
|
||||||
|
char transA, transB, Uplo;
|
||||||
|
blasint nrowa, nrowb;
|
||||||
|
IFLOAT *buffer;
|
||||||
|
IFLOAT *aa, *bb;
|
||||||
|
FLOAT *cc;
|
||||||
|
FLOAT alpha, beta;
|
||||||
|
|
||||||
|
PRINT_DEBUG_NAME;
|
||||||
|
|
||||||
|
m = *M;
|
||||||
|
k = *K;
|
||||||
|
|
||||||
|
alpha = *Alpha;
|
||||||
|
beta = *Beta;
|
||||||
|
|
||||||
|
lda = *ldA;
|
||||||
|
ldb = *ldB;
|
||||||
|
ldc = *ldC;
|
||||||
|
|
||||||
|
transA = *TRANSA;
|
||||||
|
transB = *TRANSB;
|
||||||
|
Uplo = *UPLO;
|
||||||
|
TOUPPER(transA);
|
||||||
|
TOUPPER(transB);
|
||||||
|
TOUPPER(Uplo);
|
||||||
|
|
||||||
|
transa = -1;
|
||||||
|
transb = -1;
|
||||||
|
uplo = -1;
|
||||||
|
|
||||||
|
if (transA == 'N')
|
||||||
|
transa = 0;
|
||||||
|
if (transA == 'T')
|
||||||
|
transa = 1;
|
||||||
|
|
||||||
|
if (transA == 'R')
|
||||||
|
transa = 0;
|
||||||
|
if (transA == 'C')
|
||||||
|
transa = 1;
|
||||||
|
|
||||||
|
if (transB == 'N')
|
||||||
|
transb = 0;
|
||||||
|
if (transB == 'T')
|
||||||
|
transb = 1;
|
||||||
|
|
||||||
|
if (transB == 'R')
|
||||||
|
transb = 0;
|
||||||
|
if (transB == 'C')
|
||||||
|
transb = 1;
|
||||||
|
|
||||||
|
if (Uplo == 'U')
|
||||||
|
uplo = 0;
|
||||||
|
if (Uplo == 'L')
|
||||||
|
uplo = 1;
|
||||||
|
nrowa = m;
|
||||||
|
if (transa & 1) nrowa = k;
|
||||||
|
nrowb = k;
|
||||||
|
if (transb & 1) nrowb = m;
|
||||||
|
|
||||||
|
info = 0;
|
||||||
|
|
||||||
|
if (ldc < MAX(1, m))
|
||||||
|
info = 13;
|
||||||
|
if (ldb < MAX(1, nrowb))
|
||||||
|
info = 10;
|
||||||
|
if (lda < MAX(1, nrowa))
|
||||||
|
info = 8;
|
||||||
|
if (k < 0)
|
||||||
|
info = 5;
|
||||||
|
if (m < 0)
|
||||||
|
info = 4;
|
||||||
|
if (transb < 0)
|
||||||
|
info = 3;
|
||||||
|
if (transa < 0)
|
||||||
|
info = 2;
|
||||||
|
if (uplo < 0)
|
||||||
|
info = 1;
|
||||||
|
|
||||||
|
if (info != 0) {
|
||||||
|
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
|
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||||
|
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
|
||||||
|
blasint k,
|
||||||
|
FLOAT alpha,
|
||||||
|
IFLOAT * A, blasint LDA,
|
||||||
|
IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
|
||||||
|
{
|
||||||
|
IFLOAT *aa, *bb;
|
||||||
|
FLOAT *cc;
|
||||||
|
|
||||||
|
int transa, transb, uplo;
|
||||||
|
blasint info;
|
||||||
|
blasint lda, ldb;
|
||||||
|
IFLOAT *a, *b;
|
||||||
|
XFLOAT *buffer;
|
||||||
|
|
||||||
|
PRINT_DEBUG_CNAME;
|
||||||
|
|
||||||
|
uplo = -1;
|
||||||
|
transa = -1;
|
||||||
|
transb = -1;
|
||||||
|
info = 0;
|
||||||
|
|
||||||
|
if (order == CblasColMajor) {
|
||||||
|
if (Uplo == CblasUpper) uplo = 0;
|
||||||
|
if (Uplo == CblasLower) uplo = 1;
|
||||||
|
|
||||||
|
if (TransA == CblasNoTrans)
|
||||||
|
transa = 0;
|
||||||
|
if (TransA == CblasTrans)
|
||||||
|
transa = 1;
|
||||||
|
|
||||||
|
if (TransA == CblasConjNoTrans)
|
||||||
|
transa = 0;
|
||||||
|
if (TransA == CblasConjTrans)
|
||||||
|
transa = 1;
|
||||||
|
|
||||||
|
if (TransB == CblasNoTrans)
|
||||||
|
transb = 0;
|
||||||
|
if (TransB == CblasTrans)
|
||||||
|
transb = 1;
|
||||||
|
|
||||||
|
if (TransB == CblasConjNoTrans)
|
||||||
|
transb = 0;
|
||||||
|
if (TransB == CblasConjTrans)
|
||||||
|
transb = 1;
|
||||||
|
|
||||||
|
a = (void *)A;
|
||||||
|
b = (void *)B;
|
||||||
|
lda = LDA;
|
||||||
|
ldb = LDB;
|
||||||
|
|
||||||
|
info = -1;
|
||||||
|
|
||||||
|
blasint nrowa;
|
||||||
|
blasint nrowb;
|
||||||
|
nrowa = m;
|
||||||
|
if (transa & 1) nrowa = k;
|
||||||
|
nrowb = k;
|
||||||
|
if (transb & 1) nrowb = m;
|
||||||
|
|
||||||
|
if (ldc < MAX(1, m))
|
||||||
|
info = 13;
|
||||||
|
if (ldb < MAX(1, nrowb))
|
||||||
|
info = 10;
|
||||||
|
if (lda < MAX(1, nrowa))
|
||||||
|
info = 8;
|
||||||
|
if (k < 0)
|
||||||
|
info = 5;
|
||||||
|
if (m < 0)
|
||||||
|
info = 4;
|
||||||
|
if (transb < 0)
|
||||||
|
info = 3;
|
||||||
|
if (transa < 0)
|
||||||
|
info = 2;
|
||||||
|
if (uplo < 0)
|
||||||
|
info = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (order == CblasRowMajor) {
|
||||||
|
|
||||||
|
a = (void *)B;
|
||||||
|
b = (void *)A;
|
||||||
|
|
||||||
|
lda = LDB;
|
||||||
|
ldb = LDA;
|
||||||
|
|
||||||
|
if (Uplo == CblasUpper) uplo = 0;
|
||||||
|
if (Uplo == CblasLower) uplo = 1;
|
||||||
|
|
||||||
|
if (TransB == CblasNoTrans)
|
||||||
|
transa = 0;
|
||||||
|
if (TransB == CblasTrans)
|
||||||
|
transa = 1;
|
||||||
|
|
||||||
|
if (TransB == CblasConjNoTrans)
|
||||||
|
transa = 0;
|
||||||
|
if (TransB == CblasConjTrans)
|
||||||
|
transa = 1;
|
||||||
|
|
||||||
|
if (TransA == CblasNoTrans)
|
||||||
|
transb = 0;
|
||||||
|
if (TransA == CblasTrans)
|
||||||
|
transb = 1;
|
||||||
|
|
||||||
|
if (TransA == CblasConjNoTrans)
|
||||||
|
transb = 0;
|
||||||
|
if (TransA == CblasConjTrans)
|
||||||
|
transb = 1;
|
||||||
|
|
||||||
|
info = -1;
|
||||||
|
|
||||||
|
blasint ncola;
|
||||||
|
blasint ncolb;
|
||||||
|
|
||||||
|
ncola = m;
|
||||||
|
if (transa & 1) ncola = k;
|
||||||
|
ncolb = k;
|
||||||
|
|
||||||
|
if (transb & 1) {
|
||||||
|
ncolb = m;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ldc < MAX(1,m))
|
||||||
|
info = 13;
|
||||||
|
if (ldb < MAX(1, ncolb))
|
||||||
|
info = 8;
|
||||||
|
if (lda < MAX(1, ncola))
|
||||||
|
info = 10;
|
||||||
|
if (k < 0)
|
||||||
|
info = 5;
|
||||||
|
if (m < 0)
|
||||||
|
info = 4;
|
||||||
|
if (transb < 0)
|
||||||
|
info = 2;
|
||||||
|
if (transa < 0)
|
||||||
|
info = 3;
|
||||||
|
if (uplo < 0)
|
||||||
|
info = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (info >= 0) {
|
||||||
|
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
int buffer_size;
|
||||||
|
blasint i, j;
|
||||||
|
|
||||||
|
#ifdef SMP
|
||||||
|
int nthreads;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef SMP
|
||||||
|
static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
|
||||||
|
BLASLONG, IFLOAT *, BLASLONG, FLOAT,
|
||||||
|
FLOAT *, BLASLONG, int) = {
|
||||||
|
sbgemv_thread_n, sbgemv_thread_t,
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
|
||||||
|
IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
|
||||||
|
SBGEMV_N, SBGEMV_T,};
|
||||||
|
|
||||||
|
|
||||||
|
if (m == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
|
||||||
|
|
||||||
|
if (uplo == 1) {
|
||||||
|
for (i = 0; i < m; i++) {
|
||||||
|
j = m - i;
|
||||||
|
|
||||||
|
aa = a + i;
|
||||||
|
bb = b + i * ldb;
|
||||||
|
if (transa & 1) {
|
||||||
|
aa = a + lda * i;
|
||||||
|
}
|
||||||
|
if (transb & 1)
|
||||||
|
bb = b + i;
|
||||||
|
cc = c + i * ldc + i;
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
if (beta != ONE)
|
||||||
|
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||||
|
|
||||||
|
if (alpha == ZERO)
|
||||||
|
continue;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
|
#endif
|
||||||
|
// for alignment
|
||||||
|
buffer_size = (buffer_size + 3) & ~3;
|
||||||
|
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||||
|
|
||||||
|
#ifdef SMP
|
||||||
|
|
||||||
|
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(2);
|
||||||
|
|
||||||
|
if (nthreads == 1) {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (!(transa & 1))
|
||||||
|
(gemv[(int)transa]) (j, k, alpha, aa, lda,
|
||||||
|
bb, incb, beta, cc, 1);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, alpha, aa, lda,
|
||||||
|
bb, incb, beta, cc, 1);
|
||||||
|
|
||||||
|
#ifdef SMP
|
||||||
|
} else {
|
||||||
|
if (!(transa & 1))
|
||||||
|
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||||
|
lda, bb, incb, beta, cc,
|
||||||
|
1, nthreads);
|
||||||
|
else
|
||||||
|
(gemv_thread[(int)transa]) (k, j, alpha, aa,
|
||||||
|
lda, bb, incb, beta, cc,
|
||||||
|
1, nthreads);
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
STACK_FREE(buffer);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
|
||||||
|
for (i = 0; i < m; i++) {
|
||||||
|
j = i + 1;
|
||||||
|
|
||||||
|
bb = b + i * ldb;
|
||||||
|
if (transb & 1) {
|
||||||
|
bb = b + i;
|
||||||
|
}
|
||||||
|
cc = c + i * ldc;
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
if (beta != ONE)
|
||||||
|
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||||
|
|
||||||
|
if (alpha == ZERO)
|
||||||
|
continue;
|
||||||
|
#endif
|
||||||
|
IDEBUG_START;
|
||||||
|
|
||||||
|
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||||
|
#ifdef WINDOWS_ABI
|
||||||
|
buffer_size += 160 / sizeof(FLOAT);
|
||||||
|
#endif
|
||||||
|
// for alignment
|
||||||
|
buffer_size = (buffer_size + 3) & ~3;
|
||||||
|
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||||
|
|
||||||
|
#ifdef SMP
|
||||||
|
|
||||||
|
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(2);
|
||||||
|
|
||||||
|
if (nthreads == 1) {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (!(transa & 1))
|
||||||
|
(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
|
||||||
|
incb, beta, cc, 1);
|
||||||
|
else
|
||||||
|
(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
|
||||||
|
incb, beta, cc, 1);
|
||||||
|
|
||||||
|
#ifdef SMP
|
||||||
|
} else {
|
||||||
|
if (!(transa & 1))
|
||||||
|
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||||
|
bb, incb, beta, cc, 1,
|
||||||
|
nthreads);
|
||||||
|
else
|
||||||
|
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
|
||||||
|
bb, incb, beta, cc, 1,
|
||||||
|
nthreads);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
STACK_FREE(buffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
IDEBUG_END;
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
|
@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
|
void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
|
||||||
{
|
{
|
||||||
|
|
||||||
blasint n = *N;
|
blasint n = *N;
|
||||||
blasint incx = *INCX;
|
blasint incx = *INCX;
|
||||||
blasint incy = *INCY;
|
blasint incy = *INCY;
|
||||||
|
FLOAT* ALPHA = (FLOAT*) VALPHA;
|
||||||
|
FLOAT* BETA = (FLOAT*) VBETA;
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
|
|
@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
||||||
info = 0;
|
info = 0;
|
||||||
|
|
||||||
|
|
||||||
if (lda < MAX(1, m)) info = 6;
|
if (lda < MAX(1, m)) info = 5;
|
||||||
if (ldc < MAX(1, m)) info = 8;
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
|
|
||||||
if (n < 0) info = 2;
|
if (n < 0) info = 2;
|
||||||
|
@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
if (ldc < MAX(1, m)) info = 8;
|
if (ldc < MAX(1, m)) info = 8;
|
||||||
if (lda < MAX(1, m)) info = 5;
|
if (lda < MAX(1, m)) info = 5;
|
||||||
if (n < 0) info = 2;
|
if (n < 0) info = 1;
|
||||||
if (m < 0) info = 1;
|
if (m < 0) info = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (info >= 0) {
|
if (info >= 0) {
|
||||||
|
|
|
@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
|
if ( *rows > *cols )
|
||||||
|
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2;
|
||||||
|
else
|
||||||
|
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2;
|
||||||
|
|
||||||
b = malloc(msize);
|
b = malloc(msize);
|
||||||
if ( b == NULL )
|
if ( b == NULL )
|
||||||
|
|
|
@ -102,7 +102,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
|
||||||
if (ada >= h *safmin) {
|
if (ada >= h *safmin) {
|
||||||
*C = sqrt(ada/h);
|
*C = sqrt(ada/h);
|
||||||
*R = *DA / *C;
|
*R = *DA / *C;
|
||||||
*(R+1) = *(DA+1) / *(C+1);
|
*(R+1) = *(DA+1) / *C;
|
||||||
rtmax *= 2.;
|
rtmax *= 2.;
|
||||||
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
|
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
|
||||||
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
|
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
|
||||||
|
@ -115,7 +115,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
|
||||||
*C = ada / adahsq;
|
*C = ada / adahsq;
|
||||||
if (*C >= safmin) {
|
if (*C >= safmin) {
|
||||||
*R = *DA / *C;
|
*R = *DA / *C;
|
||||||
*(R+1) = *(DA+1) / *(C+1);
|
*(R+1) = *(DA+1) / *C;
|
||||||
} else {
|
} else {
|
||||||
*R = *DA * (h / adahsq);
|
*R = *DA * (h / adahsq);
|
||||||
*(R+1) = *(DA+1) * (h / adahsq);
|
*(R+1) = *(DA+1) * (h / adahsq);
|
||||||
|
|
|
@ -1349,6 +1349,9 @@ endif ()
|
||||||
set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
|
set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
|
||||||
get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES)
|
get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES)
|
||||||
set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}")
|
set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}")
|
||||||
|
if (USE_GEMM3M)
|
||||||
|
target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
|
||||||
|
endif()
|
||||||
endfunction ()
|
endfunction ()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
if ( n == 1 ) return( ABS(x[0]) );
|
if ( n == 1 ) return( ABS(x[0]) );
|
||||||
|
|
||||||
n *= inc_x;
|
n *= inc_x;
|
||||||
while(i < n)
|
while(abs(i) < abs(n))
|
||||||
{
|
{
|
||||||
|
|
||||||
if ( x[i] != 0.0 )
|
if ( x[i] != 0.0 )
|
||||||
|
|
|
@ -62,7 +62,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
inc_x2 = 2 * inc_x;
|
inc_x2 = 2 * inc_x;
|
||||||
|
|
||||||
n *= inc_x2;
|
n *= inc_x2;
|
||||||
while(i < n)
|
while(abs(i) < abs(n))
|
||||||
{
|
{
|
||||||
|
|
||||||
if ( x[i] != 0.0 )
|
if ( x[i] != 0.0 )
|
||||||
|
|
|
@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
temp = - da_i * x[ip+1] ;
|
temp = - da_i * x[ip+1] ;
|
||||||
|
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
|
||||||
x[ip+1] = da_i * x[ip] ;
|
x[ip+1] = da_i * x[ip] ;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
CSUMKERNEL=csum.S
|
||||||
|
|
||||||
ifndef SNRM2KERNEL
|
ifndef SNRM2KERNEL
|
||||||
SNRM2KERNEL = ../arm/nrm2.c
|
SNRM2KERNEL = ../arm/nrm2.c
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
|
||||||
|
CSUMKERNEL = csum_thunderx2t99.c
|
||||||
|
ZSUMKERNEL = zsum_thunderx2t99.c
|
||||||
SAMINKERNEL = ../arm/amin.c
|
SAMINKERNEL = ../arm/amin.c
|
||||||
DAMINKERNEL = ../arm/amin.c
|
DAMINKERNEL = ../arm/amin.c
|
||||||
CAMINKERNEL = ../arm/zamin.c
|
CAMINKERNEL = ../arm/zamin.c
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
include $(KERNELDIR)/KERNEL.CORTEXA57
|
||||||
|
|
||||||
|
|
|
@ -91,8 +91,8 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||||
|
|
||||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
SNRM2KERNEL = nrm2.S
|
||||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
DNRM2KERNEL = nrm2.S
|
||||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,247 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2017, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#define N "x0" /* vector length */
|
||||||
|
#define X "x1" /* "X" vector address */
|
||||||
|
#define INC_X "x2" /* "X" stride */
|
||||||
|
#define J "x5" /* loop variable */
|
||||||
|
|
||||||
|
#define REG0 "wzr"
|
||||||
|
#define SUMF "s0"
|
||||||
|
#define SUMFD "d0"
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
|
#define KERNEL_F1 \
|
||||||
|
"ldr d1, ["X"] \n" \
|
||||||
|
"add "X", "X", #8 \n" \
|
||||||
|
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
|
||||||
|
"fadd s1, s1, s2 \n" \
|
||||||
|
"fadd "SUMF", "SUMF", s1 \n"
|
||||||
|
|
||||||
|
#define KERNEL_F32 \
|
||||||
|
"ldr q16, ["X"] \n" \
|
||||||
|
"ldr q17, ["X", #16] \n" \
|
||||||
|
"ldr q18, ["X", #32] \n" \
|
||||||
|
"ldr q19, ["X", #48] \n" \
|
||||||
|
"ldp q20, q21, ["X", #64] \n" \
|
||||||
|
"ldp q22, q23, ["X", #96] \n" \
|
||||||
|
"ldp q24, q25, ["X", #128] \n" \
|
||||||
|
"ldp q26, q27, ["X", #160] \n" \
|
||||||
|
"fadd v16.4s, v16.4s, v17.4s \n" \
|
||||||
|
"fadd v18.4s, v18.4s, v19.4s \n" \
|
||||||
|
"ldp q28, q29, ["X", #192] \n" \
|
||||||
|
"ldp q30, q31, ["X", #224] \n" \
|
||||||
|
"add "X", "X", #256 \n" \
|
||||||
|
"fadd v20.4s, v20.4s, v21.4s \n" \
|
||||||
|
"fadd v22.4s, v22.4s, v23.4s \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024] \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
|
||||||
|
"fadd v24.4s, v24.4s, v25.4s \n" \
|
||||||
|
"fadd v26.4s, v26.4s, v27.4s \n" \
|
||||||
|
"fadd v0.4s, v0.4s, v16.4s \n" \
|
||||||
|
"fadd v1.4s, v1.4s, v18.4s \n" \
|
||||||
|
"fadd v2.4s, v2.4s, v20.4s \n" \
|
||||||
|
"fadd v3.4s, v3.4s, v22.4s \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
|
||||||
|
"fadd v28.4s, v28.4s, v29.4s \n" \
|
||||||
|
"fadd v30.4s, v30.4s, v31.4s \n" \
|
||||||
|
"fadd v4.4s, v4.4s, v24.4s \n" \
|
||||||
|
"fadd v5.4s, v5.4s, v26.4s \n" \
|
||||||
|
"fadd v6.4s, v6.4s, v28.4s \n" \
|
||||||
|
"fadd v7.4s, v7.4s, v30.4s \n"
|
||||||
|
|
||||||
|
#define KERNEL_F32_FINALIZE \
|
||||||
|
"fadd v0.4s, v0.4s, v1.4s \n" \
|
||||||
|
"fadd v2.4s, v2.4s, v3.4s \n" \
|
||||||
|
"fadd v4.4s, v4.4s, v5.4s \n" \
|
||||||
|
"fadd v6.4s, v6.4s, v7.4s \n" \
|
||||||
|
"fadd v0.4s, v0.4s, v2.4s \n" \
|
||||||
|
"fadd v4.4s, v4.4s, v6.4s \n" \
|
||||||
|
"fadd v0.4s, v0.4s, v4.4s \n" \
|
||||||
|
"ext v1.16b, v0.16b, v0.16b, #8 \n" \
|
||||||
|
"fadd v0.2s, v0.2s, v1.2s \n" \
|
||||||
|
"faddp "SUMF", v0.2s \n"
|
||||||
|
|
||||||
|
#define INIT_S \
|
||||||
|
"lsl "INC_X", "INC_X", #3 \n"
|
||||||
|
|
||||||
|
#define KERNEL_S1 \
|
||||||
|
"ldr d1, ["X"] \n" \
|
||||||
|
"add "X", "X", "INC_X" \n" \
|
||||||
|
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
|
||||||
|
"fadd s1, s1, s2 \n" \
|
||||||
|
"fadd "SUMF", "SUMF", s1 \n"
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
|
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||||
|
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||||
|
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
{
|
||||||
|
FLOAT asum = 0.0 ;
|
||||||
|
|
||||||
|
if ( n < 0 ) return(asum);
|
||||||
|
|
||||||
|
__asm__ __volatile__ (
|
||||||
|
" mov "N", %[N_] \n"
|
||||||
|
" mov "X", %[X_] \n"
|
||||||
|
" mov "INC_X", %[INCX_] \n"
|
||||||
|
" fmov "SUMF", "REG0" \n"
|
||||||
|
" fmov s1, "REG0" \n"
|
||||||
|
" fmov s2, "REG0" \n"
|
||||||
|
" fmov s3, "REG0" \n"
|
||||||
|
" fmov s4, "REG0" \n"
|
||||||
|
" fmov s5, "REG0" \n"
|
||||||
|
" fmov s6, "REG0" \n"
|
||||||
|
" fmov s7, "REG0" \n"
|
||||||
|
" cmp "N", xzr \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
" cmp "INC_X", xzr \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
" cmp "INC_X", #1 \n"
|
||||||
|
" bne 5f //asum_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
|
"1: //asum_kernel_F_BEGIN: \n"
|
||||||
|
" asr "J", "N", #5 \n"
|
||||||
|
" cmp "J", xzr \n"
|
||||||
|
" beq 3f //asum_kernel_F1 \n"
|
||||||
|
|
||||||
|
"2: //asum_kernel_F32: \n"
|
||||||
|
" "KERNEL_F32" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 2b //asum_kernel_F32 \n"
|
||||||
|
" "KERNEL_F32_FINALIZE" \n"
|
||||||
|
|
||||||
|
"3: //asum_kernel_F1: \n"
|
||||||
|
" ands "J", "N", #31 \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
|
"4: //asum_kernel_F10: \n"
|
||||||
|
" "KERNEL_F1" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 4b //asum_kernel_F10 \n"
|
||||||
|
" b 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
|
"5: //asum_kernel_S_BEGIN: \n"
|
||||||
|
" "INIT_S" \n"
|
||||||
|
" asr "J", "N", #2 \n"
|
||||||
|
" cmp "J", xzr \n"
|
||||||
|
" ble 7f //asum_kernel_S1 \n"
|
||||||
|
|
||||||
|
"6: //asum_kernel_S4: \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 6b //asum_kernel_S4 \n"
|
||||||
|
|
||||||
|
"7: //asum_kernel_S1: \n"
|
||||||
|
" ands "J", "N", #3 \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
|
"8: //asum_kernel_S10: \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 8b //asum_kernel_S10 \n"
|
||||||
|
|
||||||
|
"9: //asum_kernel_L999: \n"
|
||||||
|
" fmov %[ASUM_], "SUMFD" \n"
|
||||||
|
|
||||||
|
: [ASUM_] "=r" (asum) //%0
|
||||||
|
: [N_] "r" (n), //%1
|
||||||
|
[X_] "r" (x), //%2
|
||||||
|
[INCX_] "r" (inc_x) //%3
|
||||||
|
: "cc",
|
||||||
|
"memory",
|
||||||
|
"x0", "x1", "x2", "x3", "x4", "x5",
|
||||||
|
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
|
||||||
|
);
|
||||||
|
|
||||||
|
return asum;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
|
static int casum_thread_function(BLASLONG n, BLASLONG dummy0,
|
||||||
|
BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
|
||||||
|
BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
|
||||||
|
{
|
||||||
|
*result = casum_compute(n, x, inc_x);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
{
|
||||||
|
#if defined(SMP)
|
||||||
|
int nthreads;
|
||||||
|
FLOAT dummy_alpha;
|
||||||
|
#endif
|
||||||
|
FLOAT asum = 0.0;
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
|
if (inc_x == 0 || n <= 10000)
|
||||||
|
nthreads = 1;
|
||||||
|
else
|
||||||
|
nthreads = num_cpu_avail(1);
|
||||||
|
|
||||||
|
if (nthreads == 1) {
|
||||||
|
asum = casum_compute(n, x, inc_x);
|
||||||
|
} else {
|
||||||
|
int mode, i;
|
||||||
|
char result[MAX_CPU_NUMBER * sizeof(double) * 2];
|
||||||
|
FLOAT *ptr;
|
||||||
|
|
||||||
|
mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||||
|
|
||||||
|
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||||
|
x, inc_x, NULL, 0, result, 0,
|
||||||
|
( void *)casum_thread_function, nthreads);
|
||||||
|
|
||||||
|
ptr = (FLOAT *)result;
|
||||||
|
for (i = 0; i < nthreads; i++) {
|
||||||
|
asum = asum + (*ptr);
|
||||||
|
ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
asum = casum_compute(n, x, inc_x);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return asum;
|
||||||
|
}
|
|
@ -77,7 +77,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble 9f //nrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble 9f //nrm2_kernel_L999 \n"
|
" beq 9f //nrm2_kernel_L999 \n"
|
||||||
|
|
||||||
"1: //nrm2_kernel_F_BEGIN: \n"
|
"1: //nrm2_kernel_F_BEGIN: \n"
|
||||||
" mov x6, #0x7FF0000000000000 //+Infinity \n"
|
" mov x6, #0x7FF0000000000000 //+Infinity \n"
|
||||||
|
@ -345,7 +345,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
#endif
|
#endif
|
||||||
FLOAT ssq, scale;
|
FLOAT ssq, scale;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x == 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
|
|
|
@ -229,7 +229,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
" cmp "N", xzr \n"
|
" cmp "N", xzr \n"
|
||||||
" ble 9f //nrm2_kernel_L999 \n"
|
" ble 9f //nrm2_kernel_L999 \n"
|
||||||
" cmp "INC_X", xzr \n"
|
" cmp "INC_X", xzr \n"
|
||||||
" ble 9f //nrm2_kernel_L999 \n"
|
" beq 9f //nrm2_kernel_L999 \n"
|
||||||
" cmp "INC_X", #1 \n"
|
" cmp "INC_X", #1 \n"
|
||||||
" bne 5f //nrm2_kernel_S_BEGIN \n"
|
" bne 5f //nrm2_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
|
@ -315,7 +315,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
FLOAT nrm2 = 0.0;
|
FLOAT nrm2 = 0.0;
|
||||||
double nrm2_double = 0.0;
|
double nrm2_double = 0.0;
|
||||||
|
|
||||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
if (n <= 0 || inc_x == 0) return 0.0;
|
||||||
|
|
||||||
#if defined(SMP)
|
#if defined(SMP)
|
||||||
if (n <= 10000)
|
if (n <= 10000)
|
||||||
|
|
|
@ -223,7 +223,7 @@ zscal_begin:
|
||||||
fcmp DA_I, #0.0
|
fcmp DA_I, #0.0
|
||||||
beq .Lzscal_kernel_RI_zero
|
beq .Lzscal_kernel_RI_zero
|
||||||
|
|
||||||
b .Lzscal_kernel_R_zero
|
// b .Lzscal_kernel_R_zero
|
||||||
|
|
||||||
.Lzscal_kernel_R_non_zero:
|
.Lzscal_kernel_R_non_zero:
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,244 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2017, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#define N "x0" /* vector length */
|
||||||
|
#define X "x1" /* "X" vector address */
|
||||||
|
#define INC_X "x2" /* "X" stride */
|
||||||
|
#define J "x5" /* loop variable */
|
||||||
|
|
||||||
|
#define REG0 "xzr"
|
||||||
|
#define SUMF "d0"
|
||||||
|
#define TMPF "d1"
|
||||||
|
|
||||||
|
/******************************************************************************/
|
||||||
|
|
||||||
|
#define KERNEL_F1 \
|
||||||
|
"ldr q1, ["X"] \n" \
|
||||||
|
"add "X", "X", #16 \n" \
|
||||||
|
"faddp d1, v1.2d \n" \
|
||||||
|
"fadd "SUMF", "SUMF", d1 \n"
|
||||||
|
|
||||||
|
#define KERNEL_F16 \
|
||||||
|
"ldr q16, ["X"] \n" \
|
||||||
|
"ldr q17, ["X", #16] \n" \
|
||||||
|
"ldr q18, ["X", #32] \n" \
|
||||||
|
"ldr q19, ["X", #48] \n" \
|
||||||
|
"ldp q20, q21, ["X", #64] \n" \
|
||||||
|
"ldp q22, q23, ["X", #96] \n" \
|
||||||
|
"ldp q24, q25, ["X", #128] \n" \
|
||||||
|
"ldp q26, q27, ["X", #160] \n" \
|
||||||
|
"fadd v16.2d, v16.2d, v17.2d \n" \
|
||||||
|
"fadd v18.2d, v18.2d, v19.2d \n" \
|
||||||
|
"ldp q28, q29, ["X", #192] \n" \
|
||||||
|
"ldp q30, q31, ["X", #224] \n" \
|
||||||
|
"add "X", "X", #256 \n" \
|
||||||
|
"fadd v20.2d, v20.2d, v21.2d \n" \
|
||||||
|
"fadd v22.2d, v22.2d, v23.2d \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024] \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
|
||||||
|
"fadd v24.2d, v24.2d, v25.2d \n" \
|
||||||
|
"fadd v26.2d, v26.2d, v27.2d \n" \
|
||||||
|
"fadd v28.2d, v28.2d, v29.2d \n" \
|
||||||
|
"fadd v30.2d, v30.2d, v31.2d \n" \
|
||||||
|
"fadd v0.2d, v0.2d, v16.2d \n" \
|
||||||
|
"fadd v1.2d, v1.2d, v18.2d \n" \
|
||||||
|
"fadd v2.2d, v2.2d, v20.2d \n" \
|
||||||
|
"fadd v3.2d, v3.2d, v22.2d \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
|
||||||
|
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
|
||||||
|
"fadd v4.2d, v4.2d, v24.2d \n" \
|
||||||
|
"fadd v5.2d, v5.2d, v26.2d \n" \
|
||||||
|
"fadd v6.2d, v6.2d, v28.2d \n" \
|
||||||
|
"fadd v7.2d, v7.2d, v30.2d \n"
|
||||||
|
|
||||||
|
#define KERNEL_F16_FINALIZE \
|
||||||
|
"fadd v0.2d, v0.2d, v1.2d \n" \
|
||||||
|
"fadd v2.2d, v2.2d, v3.2d \n" \
|
||||||
|
"fadd v4.2d, v4.2d, v5.2d \n" \
|
||||||
|
"fadd v6.2d, v6.2d, v7.2d \n" \
|
||||||
|
"fadd v0.2d, v0.2d, v2.2d \n" \
|
||||||
|
"fadd v4.2d, v4.2d, v6.2d \n" \
|
||||||
|
"fadd v0.2d, v0.2d, v4.2d \n" \
|
||||||
|
"faddp "SUMF", v0.2d \n"
|
||||||
|
|
||||||
|
#define INIT_S \
|
||||||
|
"lsl "INC_X", "INC_X", #4 \n"
|
||||||
|
|
||||||
|
#define KERNEL_S1 \
|
||||||
|
"ldr q1, ["X"] \n" \
|
||||||
|
"add "X", "X", "INC_X" \n" \
|
||||||
|
"faddp d1, v1.2d \n" \
|
||||||
|
"fadd "SUMF", "SUMF", d1 \n"
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
|
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||||
|
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||||
|
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
{
|
||||||
|
FLOAT asum = 0.0 ;
|
||||||
|
|
||||||
|
if ( n < 0 ) return(asum);
|
||||||
|
|
||||||
|
__asm__ __volatile__ (
|
||||||
|
" mov "N", %[N_] \n"
|
||||||
|
" mov "X", %[X_] \n"
|
||||||
|
" mov "INC_X", %[INCX_] \n"
|
||||||
|
" fmov "SUMF", "REG0" \n"
|
||||||
|
" fmov d1, "REG0" \n"
|
||||||
|
" fmov d2, "REG0" \n"
|
||||||
|
" fmov d3, "REG0" \n"
|
||||||
|
" fmov d4, "REG0" \n"
|
||||||
|
" fmov d5, "REG0" \n"
|
||||||
|
" fmov d6, "REG0" \n"
|
||||||
|
" fmov d7, "REG0" \n"
|
||||||
|
" cmp "N", xzr \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
" cmp "INC_X", xzr \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
" cmp "INC_X", #1 \n"
|
||||||
|
" bne 5f //asum_kernel_S_BEGIN \n"
|
||||||
|
|
||||||
|
"1: //asum_kernel_F_BEGIN: \n"
|
||||||
|
" asr "J", "N", #4 \n"
|
||||||
|
" cmp "J", xzr \n"
|
||||||
|
" beq 3f //asum_kernel_F1 \n"
|
||||||
|
|
||||||
|
".align 5 \n"
|
||||||
|
"2: //asum_kernel_F16: \n"
|
||||||
|
" "KERNEL_F16" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 2b //asum_kernel_F16 \n"
|
||||||
|
" "KERNEL_F16_FINALIZE" \n"
|
||||||
|
|
||||||
|
"3: //asum_kernel_F1: \n"
|
||||||
|
" ands "J", "N", #15 \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
|
"4: //asum_kernel_F10: \n"
|
||||||
|
" "KERNEL_F1" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 4b //asum_kernel_F10 \n"
|
||||||
|
" b 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
|
"5: //asum_kernel_S_BEGIN: \n"
|
||||||
|
" "INIT_S" \n"
|
||||||
|
" asr "J", "N", #2 \n"
|
||||||
|
" cmp "J", xzr \n"
|
||||||
|
" ble 7f //asum_kernel_S1 \n"
|
||||||
|
|
||||||
|
"6: //asum_kernel_S4: \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 6b //asum_kernel_S4 \n"
|
||||||
|
|
||||||
|
"7: //asum_kernel_S1: \n"
|
||||||
|
" ands "J", "N", #3 \n"
|
||||||
|
" ble 9f //asum_kernel_L999 \n"
|
||||||
|
|
||||||
|
"8: //asum_kernel_S10: \n"
|
||||||
|
" "KERNEL_S1" \n"
|
||||||
|
" subs "J", "J", #1 \n"
|
||||||
|
" bne 8b //asum_kernel_S10 \n"
|
||||||
|
|
||||||
|
"9: //asum_kernel_L999: \n"
|
||||||
|
" fmov %[ASUM_], "SUMF" \n"
|
||||||
|
|
||||||
|
: [ASUM_] "=r" (asum) //%0
|
||||||
|
: [N_] "r" (n), //%1
|
||||||
|
[X_] "r" (x), //%2
|
||||||
|
[INCX_] "r" (inc_x) //%3
|
||||||
|
: "cc",
|
||||||
|
"memory",
|
||||||
|
"x0", "x1", "x2", "x3", "x4", "x5",
|
||||||
|
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
|
||||||
|
);
|
||||||
|
|
||||||
|
return asum;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(SMP)
|
||||||
|
/*
 * Per-thread worker used by the threaded driver below.
 *
 * Runs zasum_compute() on the (n, x, inc_x) slice it is handed and writes
 * the partial sum to result[0].  dummy0..dummy3, y and inc_y are not read;
 * they are only present in the parameter list.  Always returns 0.
 */
static int zasum_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	FLOAT partial_sum = zasum_compute(n, x, inc_x);

	result[0] = partial_sum;

	return 0;
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
 * Driver: returns zasum_compute(n, x, inc_x), optionally split across
 * threads when built with SMP.
 *
 * With SMP defined, a single thread is used when inc_x == 0 or n <= 10000;
 * otherwise the thread count comes from num_cpu_avail(1).  In the threaded
 * path each worker's partial result is read from its own slot of
 * `result` (slot stride: sizeof(double) * 2 bytes) and the slots are added
 * up in thread order.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0;
#if defined(SMP)
	int nthreads;
	/* NOTE(review): never initialised; only its address is passed on.
	 * The worker ignores the corresponding parameter (dummy2). */
	FLOAT dummy_alpha;

	/* Small or zero-stride problems are not worth threading. */
	nthreads = (inc_x == 0 || n <= 10000) ? 1 : num_cpu_avail(1);

	if (nthreads != 1) {
		/* One slot of sizeof(double) * 2 bytes per possible thread. */
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		int mode = BLAS_DOUBLE | BLAS_COMPLEX;
		int i;

		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
				   x, inc_x, NULL, 0, result, 0,
				   ( void *)zasum_thread_function, nthreads);

		/* Reduce: the first FLOAT of every slot is a partial sum. */
		for (i = 0; i < nthreads; i++) {
			FLOAT *slot = (FLOAT *)(result + i * (sizeof(double) * 2));

			asum = asum + (*slot);
		}

		return asum;
	}
#endif

	/* Single-threaded path (also the only path without SMP). */
	asum = zasum_compute(n, x, inc_x);

	return asum;
}
|
|
@ -0,0 +1,149 @@
|
||||||
|
# Kernel source selection.
# Every variable below names the source file used for one kernel; all of
# them are plain recursive (`=`) assignments.  Sources come either from
# ../arm or from ../generic.

# --- AMAX / AMIN ------------------------------------------------------------
SAMAXKERNEL     = ../arm/amax.c
DAMAXKERNEL     = ../arm/amax.c
CAMAXKERNEL     = ../arm/zamax.c
ZAMAXKERNEL     = ../arm/zamax.c

SAMINKERNEL     = ../arm/amin.c
DAMINKERNEL     = ../arm/amin.c
CAMINKERNEL     = ../arm/zamin.c
ZAMINKERNEL     = ../arm/zamin.c

# --- MAX / MIN --------------------------------------------------------------
SMAXKERNEL      = ../arm/max.c
DMAXKERNEL      = ../arm/max.c

SMINKERNEL      = ../arm/min.c
DMINKERNEL      = ../arm/min.c

# --- IAMAX / IAMIN ----------------------------------------------------------
ISAMAXKERNEL    = ../arm/iamax.c
IDAMAXKERNEL    = ../arm/iamax.c
ICAMAXKERNEL    = ../arm/izamax.c
IZAMAXKERNEL    = ../arm/izamax.c

ISAMINKERNEL    = ../arm/iamin.c
IDAMINKERNEL    = ../arm/iamin.c
ICAMINKERNEL    = ../arm/izamin.c
IZAMINKERNEL    = ../arm/izamin.c

# --- IMAX / IMIN ------------------------------------------------------------
ISMAXKERNEL     = ../arm/imax.c
IDMAXKERNEL     = ../arm/imax.c

ISMINKERNEL     = ../arm/imin.c
IDMINKERNEL     = ../arm/imin.c

# --- ASUM / SUM -------------------------------------------------------------
SASUMKERNEL     = ../arm/asum.c
DASUMKERNEL     = ../arm/asum.c
CASUMKERNEL     = ../arm/zasum.c
ZASUMKERNEL     = ../arm/zasum.c

SSUMKERNEL      = ../arm/sum.c
DSUMKERNEL      = ../arm/sum.c
CSUMKERNEL      = ../arm/zsum.c
ZSUMKERNEL      = ../arm/zsum.c

# --- AXPY / COPY / DOT ------------------------------------------------------
SAXPYKERNEL     = ../arm/axpy.c
DAXPYKERNEL     = ../arm/axpy.c
CAXPYKERNEL     = ../arm/zaxpy.c
ZAXPYKERNEL     = ../arm/zaxpy.c

SCOPYKERNEL     = ../arm/copy.c
DCOPYKERNEL     = ../arm/copy.c
CCOPYKERNEL     = ../arm/zcopy.c
ZCOPYKERNEL     = ../arm/zcopy.c

SDOTKERNEL      = ../arm/dot.c
DDOTKERNEL      = ../arm/dot.c
CDOTKERNEL      = ../arm/zdot.c
ZDOTKERNEL      = ../arm/zdot.c
# DSDOT is the one DOT variant taken from ../generic rather than ../arm.
DSDOTKERNEL     = ../generic/dot.c

# --- NRM2 / ROT / SCAL / SWAP -----------------------------------------------
SNRM2KERNEL     = ../arm/nrm2.c
DNRM2KERNEL     = ../arm/nrm2.c
CNRM2KERNEL     = ../arm/znrm2.c
ZNRM2KERNEL     = ../arm/znrm2.c

SROTKERNEL      = ../arm/rot.c
DROTKERNEL      = ../arm/rot.c
CROTKERNEL      = ../arm/zrot.c
ZROTKERNEL      = ../arm/zrot.c

SSCALKERNEL     = ../arm/scal.c
DSCALKERNEL     = ../arm/scal.c
CSCALKERNEL     = ../arm/zscal.c
ZSCALKERNEL     = ../arm/zscal.c

SSWAPKERNEL     = ../arm/swap.c
DSWAPKERNEL     = ../arm/swap.c
CSWAPKERNEL     = ../arm/zswap.c
ZSWAPKERNEL     = ../arm/zswap.c

# --- GEMV (N and T variants) ------------------------------------------------
SGEMVNKERNEL    = ../arm/gemv_n.c
DGEMVNKERNEL    = ../arm/gemv_n.c
CGEMVNKERNEL    = ../arm/zgemv_n.c
ZGEMVNKERNEL    = ../arm/zgemv_n.c

SGEMVTKERNEL    = ../arm/gemv_t.c
DGEMVTKERNEL    = ../arm/gemv_t.c
CGEMVTKERNEL    = ../arm/zgemv_t.c
ZGEMVTKERNEL    = ../arm/zgemv_t.c

# --- TRMM: generic 2x2 kernels ----------------------------------------------
STRMMKERNEL     = ../generic/trmmkernel_2x2.c
DTRMMKERNEL     = ../generic/trmmkernel_2x2.c
CTRMMKERNEL     = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL     = ../generic/ztrmmkernel_2x2.c

# --- GEMM: generic 2x2 kernels with their ncopy/tcopy sources ----------------
# The *COPYOBJ names are built from $(TSUFFIX) and $(SUFFIX), which are not
# set here; because `=` is recursive they are resolved when the value is used.
SGEMMKERNEL     = ../generic/gemmkernel_2x2.c
SGEMMONCOPY     = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY     = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ  = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ  = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL     = ../generic/gemmkernel_2x2.c
DGEMMONCOPY     = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY     = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ  = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ  = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL     = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY     = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY     = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ  = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ  = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL     = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY     = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY     = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ  = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ  = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- TRSM: the same four generic sources for S, D, C and Z -------------------
STRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c

# --- Miscellaneous helpers --------------------------------------------------
SCABS_KERNEL    = ../generic/cabs.c
DCABS_KERNEL    = ../generic/cabs.c
QCABS_KERNEL    = ../generic/cabs.c
LSAME_KERNEL    = ../generic/lsame.c

# --- GEMM beta --------------------------------------------------------------
SGEMM_BETA      = ../generic/gemm_beta.c
DGEMM_BETA      = ../generic/gemm_beta.c
CGEMM_BETA      = ../generic/zgemm_beta.c
ZGEMM_BETA      = ../generic/zgemm_beta.c
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
# Double-colon `clean` rule with no prerequisites and no recipe, so on its
# own it runs no commands.  Presumably a placeholder so that `clean` exists
# as a target here and other makefiles can add their own independent
# `clean ::` rules -- TODO confirm against the including build files.
clean ::
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,
|
||||||
|
|
||||||
if ( rows <= 0 ) return(0);
|
if ( rows <= 0 ) return(0);
|
||||||
if ( cols <= 0 ) return(0);
|
if ( cols <= 0 ) return(0);
|
||||||
if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0);
|
|
||||||
|
|
||||||
aptr = a;
|
aptr = a;
|
||||||
lda *= 2;
|
lda *= 2;
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,587 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Negating pack/copy: reads values from `a` and writes their negation to
 * the contiguous output buffer `b`.
 *
 * Layout as used by the code:
 *   - `lda` is doubled on entry, and every unit of `n` covers two FLOATs
 *     (presumably interleaved real/imaginary parts -- confirm with callers).
 *   - `n` is split into panels of 16 units (n >> 4 of them), followed by at
 *     most one panel each of 8, 4, 2 and 1 units, selected by the matching
 *     bit of `n`.  A panel of c units spans 2*c contiguous FLOATs per line.
 *   - Within a panel, `m` lines (each `lda` FLOATs apart after doubling)
 *     are consumed two at a time; an odd `m` leaves one trailing line.
 *
 * For every pair of lines, all source values are loaded before any output
 * is written, and the first line's values are emitted before the second's.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  /* Holds one pair of lines of the widest (16-unit) panel: 2 * 32 FLOATs. */
  FLOAT staged[64];

  FLOAT *panel_src = a;   /* start of the current panel inside a */
  FLOAT *out       = b;   /* next write position inside b        */
  FLOAT *line0, *line1;   /* the two lines currently being read  */

  BLASLONG units;         /* panel width in units of n: 16, 8, 4, 2, 1 */
  BLASLONG width;         /* FLOATs per line in the panel (2 * units)  */
  BLASLONG panels;        /* panels of this width still to process     */
  BLASLONG i, k;

  lda *= 2;

#if 0
  fprintf(stderr, "M = %d N = %d\n", m, n);
#endif

  for (units = 16; units > 0; units >>= 1){

    /* Full 16-unit panels repeat n >> 4 times; narrower widths occur at
       most once, when the corresponding bit of n is set. */
    if (units == 16){
      panels = (n >> 4);
    } else {
      panels = (n & units) ? 1 : 0;
    }

    width = 2 * units;

    while (panels > 0){
      line0      = panel_src;
      line1      = panel_src + lda;
      panel_src += width;

      /* Two lines per iteration. */
      for (i = (m >> 1); i > 0; i--){
	for (k = 0; k < width; k++) staged[k]         = *(line0 + k);
	for (k = 0; k < width; k++) staged[width + k] = *(line1 + k);

	for (k = 0; k < 2 * width; k++) *(out + k) = -staged[k];

	line0 += 2 * lda;
	line1 += 2 * lda;
	out   += 2 * width;
      }

      /* Trailing single line when m is odd. */
      if (m & 1){
	for (k = 0; k < width; k++) staged[k]  = *(line0 + k);
	for (k = 0; k < width; k++) *(out + k) = -staged[k];

	out += width;
      }

      panels--;
    }
  }

  return 0;
}
|
|
@ -0,0 +1,333 @@
|
||||||
|
/*******************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*******************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
 * Copy one panel of `width` adjacent columns (width <= 16) into the packed
 * buffer `b`, and return the position in `b` just past the data written.
 *
 * Every source element is a pair of consecutive FLOATs (presumably the real
 * and imaginary parts of a complex value -- TODO confirm against callers).
 * `lda` must already be expressed in FLOAT units (the caller doubles it).
 *
 * For column j of the panel (0 <= j < width), with offset = posX - posY:
 *   - while offset > -j the element is read through a pointer that started
 *     at  a + (posX + j) * 2 + posY * lda  and advances by lda per row;
 *   - otherwise it is read through a pointer that started at
 *     a + posY * 2 + (posX + j) * lda  and advances by 2 per row.
 * `offset` decreases by one after each of the m rows, so a column may switch
 * from the first stepping mode to the second part-way through the panel.
 *
 * Each row appends 2 * width FLOATs to `b`, column pairs in ascending j.
 */
static FLOAT *pack_column_panel(BLASLONG m, BLASLONG width, FLOAT *a, BLASLONG lda,
                                BLASLONG posX, BLASLONG posY, FLOAT *b){

  FLOAT *src[16];      /* one read cursor per panel column           */
  FLOAT  staged[32];   /* one row of the panel: 2 FLOATs per column  */
  BLASLONG col, row;
  BLASLONG offset = posX - posY;

  /* Pick the starting address of every column cursor. */
  for (col = 0; col < width; col++) {
    if (offset > -col) {
      src[col] = a + (posX + col) * 2 + posY * lda;
    } else {
      src[col] = a + posY * 2 + (posX + col) * lda;
    }
  }

  for (row = m; row > 0; row--) {

    /* Load the whole row first; nothing is stored to b until all loads
       for this row are done. */
    for (col = 0; col < width; col++) {
      staged[2 * col + 0] = src[col][0];
      staged[2 * col + 1] = src[col][1];
    }

    /* Advance each cursor using the stepping mode selected by the
       current offset. */
    for (col = 0; col < width; col++) {
      if (offset > -col) {
        src[col] += lda;
      } else {
        src[col] += 2;
      }
    }

    /* Emit the row contiguously. */
    for (col = 0; col < 2 * width; col++) {
      b[col] = staged[col];
    }

    b += 2 * width;
    offset--;
  }

  return b;
}

/*
 * Pack an m-row by n-column region, starting at (posX, posY), from `a` into
 * the contiguous buffer `b`.
 *
 * The n columns are processed as panels: first (n >> 4) panels of 16
 * columns, then at most one panel each of 8, 4, 2 and 1 columns according to
 * the low bits of n.  posX moves forward by the panel width after each panel.
 * See pack_column_panel() for how each element's source address is chosen.
 *
 * lda is doubled on entry so that it counts FLOATs (assumes the caller passes
 * it in units of 2-FLOAT elements -- TODO confirm).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG panels;

  lda *= 2;

  /* Full 16-column panels. */
  for (panels = (n >> 4); panels > 0; panels--) {
    b = pack_column_panel(m, 16, a, lda, posX, posY, b);
    posX += 16;
  }

  /* Remainder panels, widest first. */
  if (n & 8) {
    b = pack_column_panel(m, 8, a, lda, posX, posY, b);
    posX += 8;
  }

  if (n & 4) {
    b = pack_column_panel(m, 4, a, lda, posX, posY, b);
    posX += 4;
  }

  if (n & 2) {
    b = pack_column_panel(m, 2, a, lda, posX, posY, b);
    posX += 2;
  }

  if (n & 1) {
    pack_column_panel(m, 1, a, lda, posX, posY, b);
  }

  return 0;
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue