Compare commits
No commits in common. "develop" and "risc-v" have entirely different histories.
174
.cirrus.yml
174
.cirrus.yml
|
@ -1,174 +0,0 @@
|
|||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
|
||||
#task:
|
||||
# name: AppleM1/LLVM
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||
|
||||
#task:
|
||||
# name: AppleM1/LLVM/ILP64
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||
|
||||
#task:
|
||||
# name: AppleM1/LLVM/CMAKE
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - mkdir build
|
||||
# - cd build
|
||||
# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||
# - make -j 4
|
||||
|
||||
#task:
|
||||
# name: AppleM1/GCC/MAKE/OPENMP
|
||||
# compile_script:
|
||||
# - brew install gcc@11
|
||||
# - export PATH=/opt/homebrew/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/include"
|
||||
# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
|
||||
task:
|
||||
name: AppleM1/LLVM x86_64 xbuild
|
||||
compile_script:
|
||||
- #brew install llvm
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- export ARCHS="i386 x86_64"
|
||||
- export ARCHS_STANDARD="i386 x86_64"
|
||||
- export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
|
||||
- export ARCHS_STANDARD_64_BIT=x86_64
|
||||
- export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
|
||||
- export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
|
||||
- export VALID_ARCHS="i386 x86_64"
|
||||
- xcrun --sdk macosx --show-sdk-path
|
||||
- xcodebuild -version
|
||||
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.5.sdk -arch x86_64"
|
||||
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
path: "*conf*"
|
||||
type: text/plain
|
||||
# lib_artifacts:
|
||||
# path: "libopenblas*"
|
||||
# type: application/octet-stream
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
|
||||
task:
|
||||
name: AppleM1/LLVM armv8-ios xbuild
|
||||
compile_script:
|
||||
- #brew install llvm
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- export CC=/Applications/Xcode_15.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode_15.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.5.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- xcrun --sdk iphoneos --show-sdk-path
|
||||
- ls -l /Applications
|
||||
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
|
||||
always:
|
||||
config_artifacts:
|
||||
path: "*conf*"
|
||||
type: text/plain
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest
|
||||
task:
|
||||
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||
compile_script:
|
||||
- brew install --cask android-ndk
|
||||
- export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"
|
||||
- export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
path: "*conf*"
|
||||
type: text/plain
|
||||
|
||||
task:
|
||||
name: NeoverseN1
|
||||
arm_container:
|
||||
image: node:latest
|
||||
compile_script:
|
||||
- make
|
||||
|
||||
task:
|
||||
name: NeoverseN1-ILP64
|
||||
arm_container:
|
||||
image: node:latest
|
||||
compile_script:
|
||||
- make INTERFACE64=1
|
||||
|
||||
task:
|
||||
name: NeoverseN1-OMP
|
||||
arm_container:
|
||||
image: node:latest
|
||||
cpu: 8
|
||||
compile_script:
|
||||
- make USE_OPENMP=1
|
||||
|
||||
FreeBSD_task:
|
||||
name: FreeBSD-gcc12
|
||||
freebsd_instance:
|
||||
image_family: freebsd-13-3
|
||||
install_script:
|
||||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
|
||||
compile_script:
|
||||
- ls -l /usr/local/lib
|
||||
- gmake CC=gcc
|
||||
|
||||
|
||||
FreeBSD_task:
|
||||
name: freebsd-gcc12-ilp64
|
||||
freebsd_instance:
|
||||
image_family: freebsd-13-3
|
||||
install_script:
|
||||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
|
||||
compile_script:
|
||||
- ls -l /usr/local/lib
|
||||
- gmake CC=gcc INTERFACE64=1
|
||||
|
||||
FreeBSD_task:
|
||||
name: FreeBSD-clang-openmp
|
||||
freebsd_instance:
|
||||
image_family: freebsd-13-3
|
||||
install_script:
|
||||
- pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
|
||||
- ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
|
||||
compile_script:
|
||||
- gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
|
||||
|
||||
#task:
|
||||
# name: Windows/LLVM16 --- too slow ---
|
||||
# windows_container:
|
||||
# image: cirrusci/windowsservercore:cmake-2021.12.07
|
||||
# install_script:
|
||||
# - choco list --localonly
|
||||
# - choco install -y llvm
|
||||
# - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
|
||||
# - choco install -y ninja
|
||||
# - refreshenv
|
||||
# - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
|
||||
# - vcvarsall x64
|
||||
# - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
|
||||
# - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
|
||||
# - cd build
|
||||
# - cmake --build .
|
||||
# - ctest
|
16
.cirun.yml
16
.cirun.yml
|
@ -1,16 +0,0 @@
|
|||
# Self-Hosted Github Action Runners on AWS via Cirun.io
|
||||
# Reference: https://docs.cirun.io/reference/yaml
|
||||
runners:
|
||||
- name: "aws-runner-graviton"
|
||||
# Cloud Provider: AWS
|
||||
cloud: "aws"
|
||||
region: "us-east-1"
|
||||
# Cheapest VM on AWS
|
||||
instance_type: "c7g.large"
|
||||
# Ubuntu-22.04, ami image
|
||||
machine_image: "ami-0a0c8eebcdd6dcbd0"
|
||||
preemptible: false
|
||||
# Add this label in the "runs-on" param in .github/workflows/<workflow-name>.yml
|
||||
# So that this runner is created for running the workflow
|
||||
labels:
|
||||
- "cirun-aws-runner-graviton"
|
|
@ -1,149 +0,0 @@
|
|||
name: apple m
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: macos-14
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [cmake, make]
|
||||
fortran: [gfortran]
|
||||
openmp: [0, 1]
|
||||
ilp64: [0, 1]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Print system information
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
cat /proc/cpuinfo
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
sysctl -a | grep machdep.cpu
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||
brew reinstall gcc
|
||||
brew install coreutils cmake ccache
|
||||
brew install llvm
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
# GNU make and cmake call the compilers differently. It looks like
|
||||
# that causes the cache to mismatch. Keep the ccache for both build
|
||||
# tools separate to avoid polluting each other.
|
||||
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
if [ "${{ matrix.build }}" = "make" ]; then
|
||||
# Add ccache to path
|
||||
if [ "$RUNNER_OS" = "Linux" ]; then
|
||||
echo "/usr/lib/ccache" >> $GITHUB_PATH
|
||||
elif [ "$RUNNER_OS" = "macOS" ]; then
|
||||
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
|
||||
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
|
||||
echo "" >>$GITHUB_PATH
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: |
|
||||
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
export CC="/opt/homebrew/opt/llvm/bin/clang"
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
|
||||
;;
|
||||
"cmake")
|
||||
export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
|
||||
mkdir build && cd build
|
||||
cmake -DDYNAMIC_ARCH=1 \
|
||||
-DUSE_OPENMP=${{matrix.openmp}} \
|
||||
-DINTERFACE64=${{matrix.ilp64}} \
|
||||
-DNOFORTRAN=0 \
|
||||
-DBUILD_WITHOUT_LAPACK=0 \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
cmake --build .
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- name: Show ccache status
|
||||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
|
||||
echo "::group::Tests in 'test' directory"
|
||||
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'ctest' directory"
|
||||
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'utest' directory"
|
||||
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
;;
|
||||
"cmake")
|
||||
cd build && ctest
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
|
@ -1,139 +0,0 @@
|
|||
name: arm64 graviton cirun
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- develop
|
||||
- release-**
|
||||
pull_request:
|
||||
branches:
|
||||
- develop
|
||||
- release-**
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}"
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
fortran: [gfortran]
|
||||
build: [cmake, make]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Print system information
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
cat /proc/cpuinfo
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt update
|
||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
# GNU make and cmake call the compilers differently. It looks like
|
||||
# that causes the cache to mismatch. Keep the ccache for both build
|
||||
# tools separate to avoid polluting each other.
|
||||
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
if [ "${{ matrix.build }}" = "make" ]; then
|
||||
# Add ccache to path
|
||||
if [ "$RUNNER_OS" = "Linux" ]; then
|
||||
echo "/usr/lib/ccache" >> $GITHUB_PATH
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: |
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
|
||||
;;
|
||||
"cmake")
|
||||
mkdir build && cd build
|
||||
cmake -DDYNAMIC_ARCH=1 \
|
||||
-DNOFORTRAN=0 \
|
||||
-DBUILD_WITHOUT_LAPACK=0 \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
cmake --build .
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- name: Show ccache status
|
||||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
|
||||
echo "::group::Tests in 'test' directory"
|
||||
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'ctest' directory"
|
||||
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'utest' directory"
|
||||
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
;;
|
||||
"cmake")
|
||||
cd build && ctest
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
|
@ -1,127 +0,0 @@
|
|||
name: c910v qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
|
||||
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: RISCV64_GENERIC
|
||||
triple: riscv64-linux-gnu
|
||||
apt_triple: riscv64-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
|
||||
- target: C910V
|
||||
triple: riscv64-unknown-linux-gnu
|
||||
apt_triple: riscv64-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=C910V
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install build deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
|
||||
gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
|
||||
|
||||
- name: checkout qemu
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
repository: T-head-Semi/qemu
|
||||
path: qemu
|
||||
ref: 1e692ebb43d396c52352406323fc782c1ac99a42
|
||||
|
||||
- name: build qemu
|
||||
run: |
|
||||
# Force the use of the c910v CPU model in qemu-user
|
||||
wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
|
||||
cd qemu
|
||||
patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
|
||||
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
|
||||
make -j$(nproc)
|
||||
make install
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: build OpenBLAS
|
||||
run: |
|
||||
wget ${xuetie_toolchain}/${toolchain_file_name}
|
||||
tar -xvf ${toolchain_file_name} -C /opt
|
||||
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
|
||||
|
||||
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
- name: test
|
||||
run: |
|
||||
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
|
||||
qemu-riscv64 ./utest/openblas_utest
|
||||
qemu-riscv64 ./utest/openblas_utest_ext
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
|
|
@ -1,157 +0,0 @@
|
|||
name: Run codspeed benchmarks
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
benchmarks:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest]
|
||||
fortran: [gfortran]
|
||||
build: [make]
|
||||
pyver: ["3.12"]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/setup-python@v3
|
||||
with:
|
||||
python-version: ${{ matrix.pyver }}
|
||||
|
||||
- name: Print system information
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
cat /proc/cpuinfo
|
||||
fi
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
# GNU make and cmake call the compilers differently. It looks like
|
||||
# that causes the cache to mismatch. Keep the ccache for both build
|
||||
# tools separate to avoid polluting each other.
|
||||
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}
|
||||
|
||||
- name: Write out the .pc
|
||||
run: |
|
||||
cd benchmark/pybench
|
||||
cat > openblas.pc << EOF
|
||||
libdir=${{ github.workspace }}
|
||||
includedir= ${{ github.workspace }}
|
||||
openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
|
||||
version=0.0.99
|
||||
extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
|
||||
Name: openblas
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: ${version}
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }}
|
||||
Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
|
||||
Cflags: -I${{ github.workspace}}
|
||||
EOF
|
||||
cat openblas.pc
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
if [ "${{ matrix.build }}" = "make" ]; then
|
||||
# Add ccache to path
|
||||
if [ "$RUNNER_OS" = "Linux" ]; then
|
||||
echo "/usr/lib/ccache" >> $GITHUB_PATH
|
||||
elif [ "$RUNNER_OS" = "macOS" ]; then
|
||||
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: |
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
|
||||
;;
|
||||
"cmake")
|
||||
mkdir build && cd build
|
||||
cmake -DDYNAMIC_ARCH=1 \
|
||||
-DNOFORTRAN=0 \
|
||||
-DBUILD_WITHOUT_LAPACK=0 \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
cmake --build .
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- name: Show ccache status
|
||||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Install benchmark dependencies
|
||||
run: pip install meson ninja numpy pytest pytest-codspeed --user
|
||||
|
||||
- name: Build the wrapper
|
||||
run: |
|
||||
cd benchmark/pybench
|
||||
export PKG_CONFIG_PATH=$PWD
|
||||
meson setup build --prefix=$PWD/build-install
|
||||
meson install -C build
|
||||
#
|
||||
# sanity check
|
||||
cd build/openblas_wrap
|
||||
python -c'import _flapack; print(dir(_flapack))'
|
||||
|
||||
- name: Run benchmarks under pytest-benchmark
|
||||
run: |
|
||||
cd benchmark/pybench
|
||||
pip install pytest-benchmark
|
||||
export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/
|
||||
OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd'
|
||||
|
||||
- name: Run benchmarks
|
||||
uses: CodSpeedHQ/action@v2
|
||||
with:
|
||||
token: ${{ secrets.CODSPEED_TOKEN }}
|
||||
run: |
|
||||
cd benchmark/pybench
|
||||
export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/
|
||||
OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
name: Publish docs via GitHub Pages
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- develop
|
||||
pull_request:
|
||||
branches:
|
||||
- develop
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: Deploy docs
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: "3.10"
|
||||
|
||||
- name: Install MkDocs and doc theme packages
|
||||
run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin
|
||||
|
||||
- name: Build docs site
|
||||
run: mkdocs build
|
||||
|
||||
# mkdocs gh-deploy command only builds to the top-level, hence deploying
|
||||
# with this action instead.
|
||||
# Deploys to http://www.openmathlib.org/OpenBLAS/docs/
|
||||
- name: Deploy docs
|
||||
uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
|
||||
if: ${{ github.ref == 'refs/heads/develop' }}
|
||||
with:
|
||||
github_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
publish_dir: ./site
|
||||
destination_dir: docs/
|
|
@ -2,16 +2,11 @@ name: continuous build
|
|||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
strategy:
|
||||
|
@ -42,8 +37,7 @@ jobs:
|
|||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||
sudo apt-get install -y gfortran cmake ccache
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||
brew reinstall gcc
|
||||
|
@ -152,59 +146,45 @@ jobs:
|
|||
|
||||
|
||||
msys2:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
msystem: [UCRT64, MINGW32, CLANG64, CLANG32]
|
||||
msystem: [MINGW64, MINGW32, CLANG64]
|
||||
idx: [int32, int64]
|
||||
build-type: [Release]
|
||||
include:
|
||||
- msystem: UCRT64
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-ucrt-x86_64
|
||||
fc-pkg: fc
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
- msystem: MINGW32
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-i686
|
||||
fc-pkg: fc
|
||||
fc-pkg: mingw-w64-i686-gcc-fortran
|
||||
- msystem: CLANG64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
fc-pkg: fc
|
||||
# Compiling with Flang 16 seems to cause test errors on machines
|
||||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||
no-avx512-flags: -DNO_AVX512=1
|
||||
- msystem: CLANG32
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-clang-i686
|
||||
fc-pkg: cc
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
- msystem: UCRT64
|
||||
- msystem: MINGW64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-ucrt-x86_64
|
||||
fc-pkg: fc
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
- msystem: CLANG64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
fc-pkg: fc
|
||||
# Compiling with Flang 16 seems to cause test errors on machines
|
||||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||
no-avx512-flags: -DNO_AVX512=1
|
||||
- msystem: UCRT64
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-ucrt-x86_64
|
||||
fc-pkg: fc
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
build-type: None
|
||||
exclude:
|
||||
- msystem: MINGW32
|
||||
idx: int64
|
||||
- msystem: CLANG32
|
||||
idx: int64
|
||||
|
||||
defaults:
|
||||
run:
|
||||
|
@ -229,7 +209,7 @@ jobs:
|
|||
install: >-
|
||||
base-devel
|
||||
${{ matrix.target-prefix }}-cc
|
||||
${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
|
||||
${{ matrix.fc-pkg }}
|
||||
${{ matrix.target-prefix }}-cmake
|
||||
${{ matrix.target-prefix }}-ninja
|
||||
${{ matrix.target-prefix }}-ccache
|
||||
|
@ -237,21 +217,14 @@ jobs:
|
|||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Prepare ccache
|
||||
# Get cache location of ccache
|
||||
# Create key that is used in action/cache/restore and action/cache/save steps
|
||||
id: ccache-prepare
|
||||
run: |
|
||||
echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
# It looks like this path needs to be hard-coded.
|
||||
path: C:/msys64/home/runneradmin/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Restore ccache
|
||||
uses: actions/cache/restore@v3
|
||||
with:
|
||||
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||
key: ${{ steps.ccache-prepare.outputs.key }}
|
||||
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch.
|
||||
restore-keys: |
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
|
||||
|
@ -261,10 +234,9 @@ jobs:
|
|||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
|
||||
run: |
|
||||
which ccache
|
||||
test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||
echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
|
||||
echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
|
||||
ccache -p
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 250M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
echo $HOME
|
||||
cygpath -w $HOME
|
||||
|
@ -281,7 +253,6 @@ jobs:
|
|||
-DTARGET=CORE2 \
|
||||
${{ matrix.idx64-flags }} \
|
||||
${{ matrix.c-lapack-flags }} \
|
||||
${{ matrix.no-avx512-flags }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
|
@ -293,33 +264,12 @@ jobs:
|
|||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Save ccache
|
||||
# Save the cache after we are done (successfully) building
|
||||
uses: actions/cache/save@v3
|
||||
with:
|
||||
path: ${{ steps.ccache-prepare.outputs.ccachedir }}
|
||||
key: ${{ steps.ccache-prepare.outputs.key }}
|
||||
|
||||
- name: Run tests
|
||||
id: run-ctest
|
||||
timeout-minutes: 60
|
||||
run: cd build && ctest
|
||||
|
||||
- name: Re-run tests
|
||||
if: always() && (steps.run-ctest.outcome == 'failure')
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
cd build
|
||||
echo "::group::Re-run ctest"
|
||||
ctest --rerun-failed --output-on-failure || true
|
||||
echo "::endgroup::"
|
||||
echo "::group::Log from these tests"
|
||||
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
|
||||
echo "::endgroup::"
|
||||
|
||||
|
||||
cross_build:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
|
@ -345,7 +295,6 @@ jobs:
|
|||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
|
||||
|
||||
- name: Compilation cache
|
||||
|
|
|
@ -1,119 +0,0 @@
|
|||
name: loongarch64 qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-24.04
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: LOONGSONGENERIC
|
||||
triple: loongarch64-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
|
||||
- target: LOONGSON3R5
|
||||
triple: loongarch64-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
|
||||
- target: LOONGSON2K1000
|
||||
triple: loongarch64-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
|
||||
- target: LA64_GENERIC
|
||||
triple: loongarch64-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
|
||||
- target: LA464
|
||||
triple: loongarch64-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
|
||||
- target: LA264
|
||||
triple: loongarch64-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
|
||||
- target: DYNAMIC_ARCH
|
||||
triple: loongarch64-linux-gnu
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Install APT deps
|
||||
run: |
|
||||
sudo apt-get update && \
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache qemu-user-static \
|
||||
gcc-14-loongarch64-linux-gnu g++-14-loongarch64-linux-gnu gfortran-14-loongarch64-linux-gnu
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Disable utest dsdot:dsdot_n_1
|
||||
run: |
|
||||
echo -n > utest/test_dsdot.c
|
||||
echo "Due to the current version of qemu causing utest cases to fail,"
|
||||
echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: |
|
||||
make CC='ccache ${{ matrix.triple }}-gcc-14 -static' FC='ccache ${{ matrix.triple }}-gfortran-14 -static' \
|
||||
RANLIB='ccache ${{ matrix.triple }}-gcc-ranlib-14' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
qemu-loongarch64-static ./utest/openblas_utest
|
||||
qemu-loongarch64-static ./utest/openblas_utest_ext
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
|
@ -1,141 +0,0 @@
|
|||
name: loongarch64 clang qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: LOONGSONGENERIC
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
|
||||
- target: LOONGSON3R5
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
|
||||
- target: LOONGSON2K1000
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
|
||||
- target: LA64_GENERIC
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC
|
||||
- target: LA464
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464
|
||||
- target: LA264
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264
|
||||
- target: DYNAMIC_ARCH
|
||||
opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Install libffi6
|
||||
run: |
|
||||
wget http://ftp.ca.debian.org/debian/pool/main/libf/libffi/libffi6_3.2.1-9_amd64.deb
|
||||
sudo dpkg -i libffi6_3.2.1-9_amd64.deb
|
||||
|
||||
- name: Install APT deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache
|
||||
|
||||
- name: Download and install loongarch64-toolchain
|
||||
run: |
|
||||
wget https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz
|
||||
wget https://github.com/XiWeiGu/loongarch64_toolchain/releases/download/V0.1/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz
|
||||
tar -xf clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10.tar.gz -C /opt
|
||||
tar -xf loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3.tar.xz -C /opt
|
||||
|
||||
- name: Checkout qemu
|
||||
uses: actions/checkout@v3
|
||||
with:
|
||||
repository: qemu/qemu
|
||||
path: qemu
|
||||
ref: master
|
||||
|
||||
- name: Install qemu
|
||||
run: |
|
||||
cd qemu
|
||||
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static
|
||||
make -j$(nproc)
|
||||
make install
|
||||
|
||||
- name: Set env
|
||||
run: |
|
||||
echo "PATH=$GITHUB_WORKSPACE:/opt/clang+llvm_8.0.1-6_amd64-linux-gnu_debian-10/bin:/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/bin:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Disable utest dsdot:dsdot_n_1
|
||||
run: |
|
||||
echo -n > utest/test_dsdot.c
|
||||
echo "Due to the qemu versions 7.2 causing utest cases to fail,"
|
||||
echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: make CC='ccache clang --target=loongarch64-linux-gnu --sysroot=/opt/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.3/loongarch64-linux-gnu/sysroot/ -static' FC='ccache loongarch64-linux-gnu-gfortran -static' HOSTCC='ccache clang' CROSS_SUFFIX=llvm- NO_SHARED=1 ${{ matrix.opts }} -j$(nproc)
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
|
||||
qemu-loongarch64 ./utest/openblas_utest
|
||||
qemu-loongarch64 ./utest/openblas_utest_ext
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat1
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat
|
||||
|
|
@ -2,16 +2,11 @@ name: mips64 qemu test
|
|||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
|
@ -80,7 +75,6 @@ jobs:
|
|||
run: |
|
||||
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
|
||||
qemu-mips64el ./utest/openblas_utest
|
||||
qemu-mips64el ./utest/openblas_utest_ext
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1
|
||||
|
|
|
@ -18,16 +18,11 @@ on:
|
|||
|
||||
name: Nightly-Homebrew-Build
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build-OpenBLAS-with-Homebrew:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: macos-latest
|
||||
env:
|
||||
DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer
|
||||
|
@ -37,8 +32,6 @@ jobs:
|
|||
HOMEBREW_NO_AUTO_UPDATE: "ON"
|
||||
HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON"
|
||||
HOMEBREW_NO_INSTALL_CLEANUP: "ON"
|
||||
HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: "ON"
|
||||
HOMEBREW_NO_INSTALL_FROM_API: "ON"
|
||||
|
||||
steps:
|
||||
- name: Random delay for cron job
|
||||
|
@ -69,7 +62,7 @@ jobs:
|
|||
mv *.bottle.tar.gz bottles
|
||||
|
||||
- name: Upload bottle
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v1
|
||||
with:
|
||||
name: openblas--HEAD.catalina.bottle.tar.gz
|
||||
path: bottles
|
||||
|
|
|
@ -1,256 +0,0 @@
|
|||
name: riscv64 zvl256b qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
triple: riscv64-unknown-linux-gnu
|
||||
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
|
||||
riscv_gnu_toolchain_version: 13.2.0
|
||||
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: RISCV64_ZVL128B
|
||||
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
|
||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
|
||||
- target: RISCV64_ZVL256B
|
||||
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
|
||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
||||
- target: DYNAMIC_ARCH=1
|
||||
opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1
|
||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install build deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make \
|
||||
libgomp1-riscv64-cross ccache
|
||||
wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
|
||||
tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: build OpenBLAS libs
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
|
||||
|
||||
- name: build OpenBLAS tests
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='${triple}-gcc' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
|
||||
|
||||
- name: build lapack-netlib tests
|
||||
working-directory: ./lapack-netlib/TESTING
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='${triple}-gcc' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
|
||||
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
|
||||
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
|
||||
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
|
||||
|
||||
- name: OpenBLAS tests
|
||||
shell: bash
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||
rm -rf ./test_out
|
||||
mkdir -p ./test_out
|
||||
run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \
|
||||
echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \
|
||||
if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \
|
||||
else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \
|
||||
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \
|
||||
}
|
||||
run_test test cblat1 &
|
||||
run_test test cblat2 cblat2.dat &
|
||||
run_test test cblat3 cblat3.dat &
|
||||
run_test test dblat1 &
|
||||
run_test test dblat2 dblat2.dat &
|
||||
run_test test dblat3 dblat3.dat &
|
||||
run_test test sblat1 &
|
||||
run_test test sblat2 sblat2.dat &
|
||||
run_test test sblat3 sblat3.dat &
|
||||
run_test test zblat1 &
|
||||
run_test test zblat2 zblat2.dat &
|
||||
run_test test zblat3 zblat3.dat &
|
||||
run_test ctest xccblat1 &
|
||||
run_test ctest xccblat2 cin2 &
|
||||
run_test ctest xccblat3 cin3 &
|
||||
run_test ctest xdcblat1 &
|
||||
run_test ctest xdcblat2 din2 &
|
||||
run_test ctest xdcblat3 din3 &
|
||||
run_test ctest xscblat1 &
|
||||
run_test ctest xscblat2 sin2 &
|
||||
run_test ctest xscblat3 sin3 &
|
||||
run_test ctest xzcblat1 &
|
||||
run_test ctest xzcblat2 zin2 &
|
||||
run_test ctest xzcblat3 zin3 &
|
||||
wait
|
||||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
||||
|
||||
- name: netlib tests
|
||||
shell: bash
|
||||
run: |
|
||||
: # these take a very long time
|
||||
echo "Skipping netlib tests in CI"
|
||||
exit 0
|
||||
: # comment out exit above to enable the tests
|
||||
: # probably we want to identify a subset to run in CI
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||
rm -rf ./test_out
|
||||
mkdir -p ./test_out
|
||||
run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \
|
||||
echo "$4" >> $OUTPUT; \
|
||||
echo "$CMD" >> $OUTPUT; \
|
||||
qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \
|
||||
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \
|
||||
if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \
|
||||
if grep -q rror $OUTPUT | grep -v -q "passed" | grep -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \
|
||||
}
|
||||
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" &
|
||||
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" &
|
||||
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" &
|
||||
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" &
|
||||
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" &
|
||||
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" &
|
||||
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" &
|
||||
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" &
|
||||
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
|
||||
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" &
|
||||
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" &
|
||||
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" &
|
||||
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" &
|
||||
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" &
|
||||
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" &
|
||||
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" &
|
||||
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" &
|
||||
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" &
|
||||
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" &
|
||||
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" &
|
||||
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" &
|
||||
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" &
|
||||
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" &
|
||||
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" &
|
||||
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" &
|
||||
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" &
|
||||
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" &
|
||||
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" &
|
||||
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" &
|
||||
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" &
|
||||
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" &
|
||||
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" &
|
||||
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" &
|
||||
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" &
|
||||
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" &
|
||||
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" &
|
||||
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" &
|
||||
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" &
|
||||
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
wait
|
||||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
|
||||
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
|
||||
NUMERICAL_ERRORS=-1
|
||||
OTHER_ERRORS=-1
|
||||
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)
|
||||
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
|
||||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
|
@ -14,7 +14,6 @@ lapack-3.4.2
|
|||
lapack-3.4.2.tgz
|
||||
lapack-netlib/make.inc
|
||||
lapack-netlib/lapacke/include/lapacke_mangling.h
|
||||
lapack-netlib/SRC/la_constants.mod
|
||||
lapack-netlib/TESTING/testing_results.txt
|
||||
lapack-netlib/INSTALL/test*
|
||||
lapack-netlib/TESTING/xeigtstc
|
||||
|
@ -47,59 +46,44 @@ config_last.h
|
|||
getarch
|
||||
getarch_2nd
|
||||
utest/openblas_utest
|
||||
utest/openblas_utest_ext
|
||||
ctest/xccblat1
|
||||
ctest/xccblat2
|
||||
ctest/xccblat3
|
||||
ctest/xccblat3_3m
|
||||
ctest/xdcblat1
|
||||
ctest/xdcblat2
|
||||
ctest/xdcblat3
|
||||
ctest/xdcblat3_3m
|
||||
ctest/xscblat1
|
||||
ctest/xscblat2
|
||||
ctest/xscblat3
|
||||
ctest/xscblat3_3m
|
||||
ctest/xzcblat1
|
||||
ctest/xzcblat2
|
||||
ctest/xzcblat3
|
||||
ctest/xzcblat3_3m
|
||||
exports/linktest.c
|
||||
exports/linux.def
|
||||
kernel/setparam_*.c
|
||||
kernel/kernel_*.h
|
||||
test/CBLAT2.SUMM
|
||||
test/CBLAT3.SUMM
|
||||
test/CBLAT3_3M.SUMM
|
||||
test/DBLAT2.SUMM
|
||||
test/DBLAT3.SUMM
|
||||
test/DBLAT3_3M.SUMM
|
||||
test/SBLAT2.SUMM
|
||||
test/SBLAT3.SUMM
|
||||
test/SBLAT3_3M.SUMM
|
||||
test/ZBLAT2.SUMM
|
||||
test/ZBLAT3.SUMM
|
||||
test/ZBLAT3_3M.SUMM
|
||||
test/SHBLAT3.SUMM
|
||||
test/SBBLAT3.SUMM
|
||||
test/cblat1
|
||||
test/cblat2
|
||||
test/cblat3
|
||||
test/cblat3_3m
|
||||
test/dblat1
|
||||
test/dblat2
|
||||
test/dblat3
|
||||
test/dblat3_3m
|
||||
test/sblat1
|
||||
test/sblat2
|
||||
test/sblat3
|
||||
test/sblat3_3m
|
||||
test/test_shgemm
|
||||
test/test_sbgemm
|
||||
test/zblat1
|
||||
test/zblat2
|
||||
test/zblat3
|
||||
test/zblat3_3m
|
||||
build
|
||||
build.*
|
||||
*.swp
|
||||
|
@ -109,4 +93,3 @@ benchmark/smallscaling
|
|||
CMakeCache.txt
|
||||
CMakeFiles/*
|
||||
.vscode
|
||||
**/__pycache__
|
||||
|
|
19
.travis.yml
19
.travis.yml
|
@ -285,25 +285,6 @@ matrix:
|
|||
- gfortran
|
||||
script:
|
||||
- travis_wait 45 make && make lapack-test
|
||||
env:
|
||||
- TARGET_BOX=NEOVERSE_N1
|
||||
|
||||
- &test-neon1-gcc8
|
||||
os: linux
|
||||
arch: arm64
|
||||
dist: focal
|
||||
group: edge
|
||||
virt: lxd
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- gcc-8
|
||||
- gfortran-8
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 CC=gcc-8 FC=gfortran-8 DYNAMIC_ARCH=1
|
||||
env:
|
||||
- TARGET_BOX=NEOVERSE_N1-GCC8
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
|
156
CMakeLists.txt
156
CMakeLists.txt
|
@ -2,13 +2,13 @@
|
|||
## Author: Hank Anderson <hank@statease.com>
|
||||
##
|
||||
|
||||
cmake_minimum_required(VERSION 3.16.0)
|
||||
cmake_minimum_required(VERSION 2.8.5)
|
||||
|
||||
project(OpenBLAS C ASM)
|
||||
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 28.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 21.dev)
|
||||
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
|
@ -20,19 +20,13 @@ include(CMakePackageConfigHelpers)
|
|||
#######
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
|
||||
|
||||
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
|
||||
|
||||
set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")
|
||||
|
||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||
|
||||
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
|
||||
|
||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF)
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||
|
||||
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
|
||||
|
@ -42,13 +36,6 @@ option(USE_LOCKING "Use locks even in single-threaded builds to make them callab
|
|||
|
||||
option(USE_PERL "Use the older PERL scripts for build preparation instead of universal shell scripts" OFF)
|
||||
|
||||
option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
|
||||
|
||||
option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF)
|
||||
|
||||
set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" )
|
||||
set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" )
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
else()
|
||||
|
@ -102,14 +89,10 @@ endif()
|
|||
|
||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||
|
||||
if (USE_OPENMP)
|
||||
find_package(OpenMP REQUIRED)
|
||||
endif ()
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||
|
||||
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
|
||||
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
|
||||
|
||||
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
||||
|
||||
|
@ -253,7 +236,7 @@ endif()
|
|||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin")
|
||||
if(ANDROID)
|
||||
if(BUILD_STATIC_LIBS)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_static m)
|
||||
endif()
|
||||
|
@ -262,30 +245,20 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (USE_OPENMP)
|
||||
if(BUILD_STATIC_LIBS)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C)
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# It seems that this hack is no longer required since macOS 11 Big Sur
|
||||
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
|
||||
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
|
||||
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
|
||||
if (NOT NOFORTRAN)
|
||||
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
|
||||
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
|
||||
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
|
||||
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
|
||||
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
|
||||
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
|
||||
else ()
|
||||
set (CMAKE_C_CREATE_SHARED_LIBRARY
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
|
||||
"sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
|
||||
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
|
||||
endif ()
|
||||
endif()
|
||||
|
@ -334,36 +307,29 @@ endif()
|
|||
|
||||
#if (MSVC OR NOT NOFORTRAN)
|
||||
if (NOT NO_CBLAS)
|
||||
if (NOT ONLY_CBLAS)
|
||||
# Broken without fortran on unix
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
|
||||
if (NOT NOFORTRAN)
|
||||
if (NOT ONLY_CBLAS)
|
||||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
|
||||
if (BUILD_TESTING)
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
endif()
|
||||
endif()
|
||||
if(NOT NO_CBLAS)
|
||||
if (NOT ONLY_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
endif()
|
||||
endif()
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
|
||||
if (NOT FIXED_LIBNAME)
|
||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
||||
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
|
||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||
if (NOT MSVC)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
|
||||
|
@ -430,106 +396,21 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
|||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
|
||||
if (${BUILD_LAPACK_DEPRECATED})
|
||||
set (BLD 1)
|
||||
else ()
|
||||
set (BLD 0)
|
||||
endif()
|
||||
if (${BUILD_BFLOAT16})
|
||||
set (BBF16 1)
|
||||
else ()
|
||||
set (BBF16 0)
|
||||
endif()
|
||||
if (${BUILD_SINGLE})
|
||||
set (BS 1)
|
||||
else ()
|
||||
set (BS 0)
|
||||
endif()
|
||||
if (${BUILD_DOUBLE})
|
||||
set (BD 1)
|
||||
else ()
|
||||
set (BD 0)
|
||||
endif()
|
||||
if (${BUILD_COMPLEX})
|
||||
set (BC 1)
|
||||
else ()
|
||||
set (BC 0)
|
||||
endif()
|
||||
if (${BUILD_COMPLEX16})
|
||||
set (BZ 1)
|
||||
else ()
|
||||
set (BZ 0)
|
||||
endif()
|
||||
if (NOT USE_PERL)
|
||||
if (NOT DEFINED USE_PERL)
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
else()
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_BENCHMARKS)
|
||||
#find_package(OpenMP REQUIRED)
|
||||
file(GLOB SOURCES "benchmark/*.c")
|
||||
if (NOT USE_OPENMP)
|
||||
file(GLOB REMFILE "benchmark/smallscaling.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
endif()
|
||||
if (BUILD_WITHOUT_LAPACK)
|
||||
file(GLOB REMFILE "benchmark/cholesky.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/geev.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/gesv.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/getri.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/potrf.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/spmv.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/symv.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/linpack.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
endif()
|
||||
if (NOT USE_GEMM3M)
|
||||
file(GLOB REMFILE "benchmark/gemm3m.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
endif()
|
||||
foreach(source ${SOURCES})
|
||||
get_filename_component(name ${source} NAME_WE)
|
||||
if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
|
||||
set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
|
||||
foreach(define ${defines})
|
||||
set(target_name "benchmark_${name}")
|
||||
if (NOT "${define}" STREQUAL "DEFAULT")
|
||||
string(JOIN "_" define_str ${define})
|
||||
set(target_name "${target_name}_${define_str}")
|
||||
endif()
|
||||
if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
|
||||
(NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
|
||||
(NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
|
||||
(NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
|
||||
add_executable(${target_name} ${source})
|
||||
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
|
||||
# target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
|
||||
if (NOT "${define}" STREQUAL "DEFAULT")
|
||||
target_compile_definitions(${target_name} PRIVATE ${define})
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
|
||||
# Install project
|
||||
|
@ -620,7 +501,7 @@ if(NOT NO_LAPACKE)
|
|||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
endif()
|
||||
|
||||
# Install pkg-config files
|
||||
|
@ -628,8 +509,9 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/
|
|||
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||
|
||||
|
||||
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
||||
set(PN OpenBLAS)
|
||||
set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
|
||||
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
|
||||
configure_package_config_file(cmake/${PN}Config.cmake.in
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
|
||||
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
|
|
|
@ -23,9 +23,6 @@
|
|||
* Optimization on AMD Piledriver
|
||||
* Optimization on Intel Haswell
|
||||
|
||||
* Chris Sidebottom <chris.sidebottom@arm.com>
|
||||
* Optimizations and other improvements targeting AArch64
|
||||
|
||||
## Previous Developers
|
||||
|
||||
* Zaheer Chothia <zaheer.chothia@gmail.com>
|
||||
|
@ -198,9 +195,6 @@ In chronological order:
|
|||
* PingTouGe Semiconductor Co., Ltd.
|
||||
* [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
|
||||
|
||||
* Jake Arkinstall <https://github.com/jake-arkinstall>
|
||||
* [2021-02-10] Remove in-source configure_file to enable builds in read-only contexts (issue #3100, PR #3101)
|
||||
|
||||
* River Dillon <oss@outerpassage.net>
|
||||
* [2021-07-10] fix compilation with musl libc
|
||||
|
||||
|
@ -218,14 +212,4 @@ In chronological order:
|
|||
* [2022-03] Support RISC-V Vector Intrinsic 1.0 version.
|
||||
|
||||
* Pablo Romero <https://github.com/pablorcum>
|
||||
* [2022-08] Fix building from sources for QNX
|
||||
|
||||
* Mark Seminatore <https://github.com/mseminatore>
|
||||
* [2023-11-09] Improve Windows threading performance scaling
|
||||
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
|
||||
|
||||
* Dirreke <https://github.com/Dirreke>
|
||||
* [2024-01-16] Add basic support for the CSKY architecture
|
||||
|
||||
* Christopher Daley <https://github.com/cdaley>
|
||||
* [2024-01-24] Optimize GEMV forwarding on ARM64 systems
|
||||
* [2022-08] Fix building from sources for QNX
|
507
Changelog.txt
507
Changelog.txt
|
@ -1,511 +1,4 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.28
|
||||
8-Aug-2024
|
||||
|
||||
general:
|
||||
- Reworked the unfinished implementation of HUGETLB from GotoBLAS
|
||||
for allocating huge memory pages as buffers on suitable systems
|
||||
- Changed the unfinished implementation of GEMM3M for the generic
|
||||
target on all architectures to at least forward to regular GEMM
|
||||
- Improved multithreaded GEMM performance for large non-skinny matrices
|
||||
- Improved BLAS3 performance on larger multicore systems through improved
|
||||
parallelism
|
||||
- Improved performance of the initial memory allocation by reducing
|
||||
locking overhead
|
||||
- Improved performance of GBMV at small problem sizes by introducing
|
||||
a size barrier for the switch to multithreading
|
||||
- Added an implementation of the CBLAS_GEMM_BATCH extension
|
||||
- Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in
|
||||
CMAKE builds (error introduced in 0.3.27)
|
||||
- Fixed corner cases involving the handling of NAN and INFINITY
|
||||
arguments in ?SCAL on all architectures
|
||||
- Added support for cross-compiling to WebAssembly (WASM) with CMAKE (in addition
|
||||
to the already present makefile support)
|
||||
- Fixed NAN handling and potential accuracy issues in compilations with
|
||||
Intel ICX by supplying a suitable fp-model option by default
|
||||
- The contents of the github project wiki have been converted into
|
||||
a new set of documentation included with the source code.
|
||||
- It is now possible to register a callback function that replaces
|
||||
the built-in support for multithreading with an external backend
|
||||
like TBB (openblas_set_threads_callback_function)
|
||||
- Fixed potential duplication of suffixes in shared library naming
|
||||
- Improved C compiler detection by the build system to tolerate more
|
||||
naming variants for gcc builds
|
||||
- Fixed an unnecessary dependency of the utest on CBLAS
|
||||
- Fixed spurious error reports from the BLAS extensions utest
|
||||
- Fixed unwanted invocation of the GEMM3M tests in cross-compilation
|
||||
- Fixed a flaw in the makefile build that could lead to the pkgconfig
|
||||
file containing an entry of UNKNOWN for the target cpu after installing
|
||||
- Integrated fixes from the Reference-LAPACK project:
|
||||
- Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961)
|
||||
- Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018)
|
||||
- Fixed potential infinite loop in the LAPACK testsuite (PR 1024)
|
||||
- Make the variable type used for hidden length arguments configurable (PR 1025)
|
||||
- Fixed SYTRD workspace computation and various typos (PR 1030)
|
||||
- Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033)
|
||||
|
||||
x86-64:
|
||||
- reverted thread management under Windows to its state before 0.3.26
|
||||
due to signs of race conditions in some circumstances now under study
|
||||
- fixed accidental selection of the unoptimized generic SBGEMM kernel
|
||||
in CMAKE builds for CooperLake and SapphireRapids targets
|
||||
- fixed a potential thread buffer overrun in SBSTOBF16 on small systems
|
||||
- fixed an accuracy issue in ZSCAL introduced in 0.3.26
|
||||
- fixed compilation with CMAKE and recent releases of LLVM
|
||||
- added support for Intel Emerald Rapids and Meteor Lake cpus
|
||||
- added autodetection support for the Zhaoxin KX-7000 cpu
|
||||
- fixed autodetection of Intel Prescott (probably broken since 0.3.19)
|
||||
- fixed compilation for older targets with the Yocto SDK
|
||||
- fixed compilation of the converter-generated C versions
|
||||
of the LAPACK sources with gcc-14
|
||||
- improved compiler options when building with CMAKE and LLVM for
|
||||
AVX512-capable targets
|
||||
- added support for supplying the L2 cache size via an environment
|
||||
variable (OPENBLAS_L2_SIZE) in case it is not correctly reported
|
||||
(as in some VM configurations)
|
||||
- improved the error message shown when thread creation fails on startup
|
||||
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
|
||||
|
||||
arm:
|
||||
- fixed building for baremetal targets with make
|
||||
|
||||
arm64:
|
||||
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
|
||||
matrix to the corresponding GEMV kernel
|
||||
- added optimized SGEMV and DGEMV kernels for A64FX
|
||||
- added optimized SVE kernels for small-matrix GEMM
|
||||
- added A64FX to the cpu list for DYNAMIC_ARCH
|
||||
- fixed building with support for cpu affinity
|
||||
- worked around accuracy problems with C/ZNRM2 on NeoverseN1 and
|
||||
Apple M targets
|
||||
- improved GEMM performance on Neoverse V1
|
||||
- fixed compilation for NEOVERSEN2 with older compilers
|
||||
- fixed potential miscompilation of the SVE SDOT and DDOT kernels
|
||||
- fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels
|
||||
- fixed a potential overflow when using very large user-defined BUFFERSIZE
|
||||
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
|
||||
|
||||
power:
|
||||
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
|
||||
matrix to the corresponding GEMV kernel
|
||||
- significantly improved performance of SBGEMM on POWER10
|
||||
- fixed compilation with OpenMP and the XLF compiler
|
||||
- fixed building of the BLAS extension utests under AIX
|
||||
- fixed building of parts of the LAPACK testsuite with XLF
|
||||
- fixed CSWAP/ZSWAP on big-endian POWER10 targets
|
||||
- fixed a performance regression in SAXPY on POWER10 with OpenXL
|
||||
- fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM
|
||||
- fixed building for POWER9 under FreeBSD
|
||||
- fixed a potential overflow when using very large user-defined BUFFERSIZE
|
||||
- fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV
|
||||
|
||||
riscv64:
|
||||
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
|
||||
matrix to the corresponding GEMV kernel
|
||||
- fixed building for RISCV64_GENERIC with OpenMP enabled
|
||||
- added DYNAMIC_ARCH support (comprising GENERIC_RISCV64 and the two
|
||||
RVV 1.0 targets with vector length of 128 and 256)
|
||||
- worked around the ZVL128B kernels for AXPBY mishandling the special
|
||||
case of zero Y increment
|
||||
|
||||
loongarch64:
|
||||
- improved GEMM performance on servers of the 3C5000 generation
|
||||
- improved performance and stability of DGEMM
|
||||
- improved GEMV and TRSM kernels for LSX and LASX vector ABIs
|
||||
- fixed CMAKE compilation with the INTERFACE64 option set
|
||||
- fixed compilation with CMAKE
|
||||
- worked around spurious errors flagged by the BLAS3 tests
|
||||
- worked around a miscompilation of the POTRS utest by gcc 14.1
|
||||
|
||||
mips64:
|
||||
- fixed ASUM and SUM kernels to accept negative step sizes in X
|
||||
- fixed complex GEMV kernels for MSA
|
||||
|
||||
====================================================================
|
||||
Version 0.3.27
|
||||
4-Apr-2024
|
||||
|
||||
general:
|
||||
- added initial (generic) support for the CSKY architecture
|
||||
- capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating
|
||||
underutilized or idle threads
|
||||
- sped up multithreaded POTRF on all platforms
|
||||
- added extension openblas_set_num_threads_local() that returns the previous thread count
|
||||
- re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading
|
||||
for too small workloads
|
||||
- improved the fallback code used when the precompiled number of threads is exceeded,
|
||||
and made it callable multiple times during the lifetime of an instance
|
||||
- added CBLAS interfaces for the BLAS extensions ?AMIN,?AMAX, CAXPYC and ZAXPYC
|
||||
- fixed a potential buffer overflow in the interface to the GEMMT kernels
|
||||
- fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14
|
||||
- fixed unwanted case sensitivity of the character parameters in ?TRTRS
|
||||
- sped up the OpenMP thread management code
|
||||
- fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK
|
||||
- fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library
|
||||
- added a testsuite for the BLAS extensions
|
||||
- modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress
|
||||
spurious errors
|
||||
- added support for building the benchmark collection with CMAKE
|
||||
- added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds
|
||||
with OpenMP enabled that use clang with gfortran
|
||||
- fixed building on systems with uClibc
|
||||
- added support for calling ?NRM2 with a negative increment value on all architectures
|
||||
- added support for the LLVM18 version of the flang-new compiler
|
||||
- fixed handling of the OPENBLAS_LOOPS variable in several benchmarks
|
||||
- Integrated fixes from the Reference-LAPACK project:
|
||||
- Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981)
|
||||
|
||||
x86:
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed GEMM3M functions failing in CMAKE builds
|
||||
|
||||
x86-64:
|
||||
- removed all instances of sched_yield() on Linux and BSD
|
||||
- fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26)
|
||||
- fixed GEMM3M functions failing in CMAKE builds
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- added compiler checks for AVX512BF16 compatibility
|
||||
- fixed LLVM compiler options for Sapphire Rapids
|
||||
- fixed cpu handling fallbacks for Sapphire Rapids with
|
||||
disabled AVX2 in DYNAMIC_ARCH mode
|
||||
- fixed extensions SCSUM and DZSUM
|
||||
- improved GEMM performance for ZEN targets
|
||||
|
||||
arm:
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
|
||||
arm64:
|
||||
- added initial support for the Cortex-A76 cpu
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed default compiler options for gcc (-march and -mtune)
|
||||
- added support for ArmCompilerForLinux
|
||||
- added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds
|
||||
- fixed mishandling of the INTERFACE64 option in CMAKE builds
|
||||
- corrected SCSUM kernels (erroneously duplicating SCASUM behaviour)
|
||||
- added SVE-enabled kernels for CSUM/ZSUM
|
||||
- worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M
|
||||
|
||||
power:
|
||||
- improved performance of SGEMM on POWER8/9/10
|
||||
- improved performance of DGEMM on POWER10
|
||||
- added support for OpenMP builds with xlc/xlf on AIX
|
||||
- improved cpu autodetection for DYNAMIC_ARCH builds on older AIX
|
||||
- fixed cpu core counting on AIX
|
||||
- added support for building a shared library on AIX
|
||||
|
||||
riscv64:
|
||||
- added support for the X280 cpu
|
||||
- added support for semi-generic RISCV models with vector length 128 or 256
|
||||
- added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- improved cpu model autodetection
|
||||
- fixed corner cases in ?AXPBY for C910V
|
||||
- fixed handling of zero increments in ?AXPY kernels for C910V
|
||||
|
||||
loongarch64:
|
||||
- added optimized kernels for ?AMIN and ?AMAX
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed handling of corner cases in ?AXPBY
|
||||
- fixed computation of SAMIN and DAMIN in LSX mode
|
||||
- fixed computation of ?ROT
|
||||
- added optimized SSYMV and DSYMV kernels for LSX and LASX mode
|
||||
- added optimized CGEMM and ZGEMM kernels for LSX and LASX mode
|
||||
- added optimized CGEMV and ZGEMV kernels
|
||||
|
||||
mips:
|
||||
- fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22)
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed mishandling of the INTERFACE64 option in CMAKE builds
|
||||
|
||||
zarch:
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed calculation of ?SUM on Z13
|
||||
|
||||
====================================================================
|
||||
Version 0.3.26
|
||||
2-Jan-2024
|
||||
|
||||
general:
|
||||
- improved the version of openblas.pc that is created by the CMAKE build
|
||||
- fixed a CMAKE-specific build problem on older versions of MacOS
|
||||
- worked around linking problems on old versions of MacOS
|
||||
- corrected installation location of the lapacke_mangling header in CMAKE builds
|
||||
- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
|
||||
- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
|
||||
- imported additions and corrections from the Reference-LAPACK project:
|
||||
- added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
|
||||
- handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
|
||||
- fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
|
||||
- fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
|
||||
- added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
|
||||
- fixed several memory leaks in the LAPACK testsuite (PR 953)
|
||||
- fixed counting of testsuite results by the Python script (PR 954)
|
||||
|
||||
x86-64:
|
||||
- fixed computation of CASUM on SkylakeX and newer targets in the special
|
||||
case that AVX512 is not supported by the compiler or operating environment
|
||||
- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
|
||||
- worked around a problem in the pre-AVX kernels for GEMV
|
||||
- sped up the thread management code on MS Windows
|
||||
|
||||
arm64:
|
||||
- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
|
||||
- sped up the thread management code on MS Windows
|
||||
- sped up SGEMM and DGEMM on Neoverse V1 and N1
|
||||
- sped up ?DOT on SVE-capable targets
|
||||
- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
|
||||
- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
|
||||
|
||||
power:
|
||||
- improved the SGEMM kernel for POWER10
|
||||
- fixed compilation with (very) old versions of gcc
|
||||
- fixed detection of old 32bit PPC targets in CMAKE-based builds
|
||||
- added autodetection of the POWERPC 7400 subtype
|
||||
- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
|
||||
|
||||
loongarch64:
|
||||
- added and improved optimized kernels for almost all BLAS functions
|
||||
|
||||
====================================================================
|
||||
Version 0.3.25
|
||||
12-Nov-2023
|
||||
|
||||
general:
|
||||
- improved the error message shown on exceeding the maximum thread count
|
||||
- improved the code to add supplementary thread buffers in case of overflow
|
||||
- fixed a potential division by zero in ?ROTG
|
||||
- improved the ?MATCOPY functions to accept zero-sized rows or columns
|
||||
- corrected empty prototypes in function declarations
|
||||
- cleaned up unused declarations in the f2c-converted versions of the LAPACK sources
|
||||
- fixed compilation with the Cray CCE Compiler suite
|
||||
- improved link line rewriting to avoid mixed libgomp/libomp builds with clang&gfortran
|
||||
- worked around OPENMP builds with LLVM14's libomp hanging on FreeBSD
|
||||
- improved the Makefiles to require less option duplication on "make install"
|
||||
- imported the following changes from the upcoming release 3.12 of Reference-LAPACK
|
||||
- deprecate utility functions ?GELQS and ?GEQRS (LAPACK PR 900)
|
||||
- apply rounding up to workspace calculations done in floating point (LAPACK PR 904)
|
||||
- avoid overflow in STGEX2/DTGEX2 (LAPACK PR 907)
|
||||
- fix accumulation in ?LASSQ (LAPACK PR 909)
|
||||
- fix handling of NaN values in ?GECON (LAPACK PR 926)
|
||||
- avoid overflow in CBDSQR/ZBDSQR (LAPACK PR 927)
|
||||
- fix poor vector orthogonalizations in ?ORBDB5/?UNBDB5 (LAPACK PR 928 & 930)
|
||||
|
||||
x86-64:
|
||||
- fixed compile-time autodetection of AMD Ryzen3 and Ryzen4 cpus
|
||||
- fixed capability-based fallback selection for unknown cpus in DYNAMIC_ARCH
|
||||
- added AVX512 optimizations for ?ASUM on Sapphire Rapids and Cooper Lake
|
||||
|
||||
ARM64:
|
||||
- fixed building on Apple with homebrew gcc
|
||||
- fixed building with XCODE 15
|
||||
- fixed building on A64FX and Cortex A710/X1/X2
|
||||
- increased the default buffer size for recent ARM server cpus
|
||||
|
||||
POWER:
|
||||
- fixed building with the IBM xlf 16.1.1 compiler
|
||||
- fixed building with IBM XL C
|
||||
- added support for DYNAMIC_ARCH builds with clang
|
||||
- fixed union declaration in the BFLOAT16 test case
|
||||
- enable optimizations for the AIX assembler on POWER10
|
||||
|
||||
LOONGARCH64:
|
||||
- added an optimized SGEMV kernel
|
||||
- added an optimized DTRSM kernel
|
||||
|
||||
====================================================================
|
||||
Version 0.3.24
|
||||
03-Sep-2023
|
||||
|
||||
general:
|
||||
- declared the arguments of cblas_xerbla as const (in accordance with the reference implementation
|
||||
and others, the previous discrepancy appears to have dated back to GotoBLAS)
|
||||
- fixed the implementation of ?GEMMT that was added in 0.3.23
|
||||
- made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds
|
||||
- fixed application of SYMBOLSUFFIX in CMAKE builds
|
||||
- fixed missing SSYCONVF function in the shared library
|
||||
- fixed parallel build logic used with gmake
|
||||
- added support for compilation with LLVM17, in particular its new Fortran compiler
|
||||
- added support for CMAKE builds using the NVIDIA HPC compiler
|
||||
- fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler
|
||||
- fixed cross-build detection and management in c_check
|
||||
- disabled building of the tests with CMAKE when ONLY_CBLAS is defined
|
||||
- fixed several issues with the handling of runtime limits on the number of OPENMP threads
|
||||
- corrected the error code returned by SGEADD/DGEADD when LDA is too small
|
||||
- corrected the error code returned by IMATCOPY when LDB is too small
|
||||
- updated ?NRM2 to support negative increment values (as introduced in release 3.10
|
||||
of the reference BLAS)
|
||||
- fixed OpenMP builds with CLANG for the case where libomp is not in a standard location
|
||||
- fixed a potential overwrite of unrelated memory during thread initialisation on startup
|
||||
- fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK
|
||||
- fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22
|
||||
- fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE
|
||||
- applied additions and corrections from the development branch of Reference-LAPACK:
|
||||
- fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885)
|
||||
- fixed workspace query results in LAPACK ?SYTRF/?TREVC3 (from Reference-LAPACK PR 883)
|
||||
- fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878)
|
||||
- fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876)
|
||||
- added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839)
|
||||
- corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867)
|
||||
- removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860)
|
||||
- updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852)
|
||||
- fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855)
|
||||
- fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849)
|
||||
- added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736)
|
||||
- fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854)
|
||||
- applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847)
|
||||
- removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832)
|
||||
- fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836)
|
||||
- added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837)
|
||||
- updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831)
|
||||
- improved algorithm description in ?GELSY (Reference-LAPACK PR 833)
|
||||
- fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830)
|
||||
- fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768)
|
||||
- added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827)
|
||||
- added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795)
|
||||
- fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820)
|
||||
- adopt refactored ?GEBAL implementation (Reference-LAPACK PR 808)
|
||||
|
||||
x86_64:
|
||||
- added cpu model autodetection for Intel Alder Lake N
|
||||
- added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel
|
||||
- worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer
|
||||
- fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG
|
||||
- fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH
|
||||
- fixed feature-based cputype fallback in DYNAMIC_ARCH
|
||||
- added support for building the AVX512 kernels with the NVIDIA HPC compiler
|
||||
- corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case
|
||||
- fixed a potential use of uninitialized variables in ZTRSM
|
||||
|
||||
ARM64:
|
||||
- added cpu model autodetection for Apple M2
|
||||
- fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register)
|
||||
- added support for building the SVE kernels with the NVIDIA HPC compiler
|
||||
- added support for building the SVE kernels with the Apple Clang compiler
|
||||
- fixed compiler option handling for building the SVE kernels with LLVM
|
||||
- implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse
|
||||
- activated SVE SGEMM and DGEMM kernels for Neoverse V1
|
||||
- improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1
|
||||
- improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH
|
||||
- fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or
|
||||
container restrictions into account
|
||||
- fixed a potential use of uninitialized variables in ZTRSM
|
||||
- fix a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds
|
||||
|
||||
LOONGARCH64:
|
||||
- added ABI detection
|
||||
- added support for cpu affinity handling
|
||||
- fixed compilation with early versions of the Loongson toolchain
|
||||
- added an optimized SGEMM kernel for 3A5000
|
||||
- added optimized DGEMV kernels for 3A5000
|
||||
- improved the performance of the DGEMM kernel for 3A5000
|
||||
|
||||
MIPS64:
|
||||
- fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target
|
||||
|
||||
POWER:
|
||||
- fixed compiler warnings in the POWER10 SBGEMM kernel
|
||||
|
||||
RISCV:
|
||||
- fixed application of the INTERFACE64 option when building with CMAKE
|
||||
- fix a potential misdetection of RISCV hardware as 32bit in CMAKE builds
|
||||
- fixed IDAMAX and DOT kernels for C910V
|
||||
- fixed corner cases in the ROT and SWAP kernels for C910V
|
||||
- fixed compilation of the C910V target with recent vendor compilers
|
||||
|
||||
====================================================================
|
||||
Version 0.3.23
|
||||
01-Apr-2023
|
||||
|
||||
general:
|
||||
- fixed a serious regression in GETRF/GETF2 and ZGETRF/ZGETF2 where
|
||||
subnormal but nonzero data elements triggered the singularity flag
|
||||
- fixed a long-standing bug in CSPR/ZSPR in single-threaded operation
|
||||
for cases where elements of the X vector are real numbers (or
|
||||
complex with only the real part zero)
|
||||
- fixed gmake builds with the option NO_LAPACK
|
||||
- fixed a few instances in the gmake Makefiles where expressly
|
||||
setting NO_LAPACK=0 or NO_LAPACKE=0 would have the opposite effect
|
||||
|
||||
x86_64:
|
||||
- added further CPUID values for Intel Raptor Lake
|
||||
|
||||
====================================================================
|
||||
Version 0.3.22
|
||||
26-Mar-2023
|
||||
|
||||
general:
|
||||
- Updated the included LAPACK to Reference-LAPACK release 3.11.0
|
||||
plus post-release corrections and improvements
|
||||
- Added initial support for processing with the EMSCRIPTEN javascript
|
||||
converter (yielding a single-threaded build only)
|
||||
- Added a threshold for multithreading in SYMM, SYMV and SYR2K
|
||||
- Increased the threshold for multithreading in SYRK
|
||||
- OpenBLAS no longer decreases the global OMP_NUM_THREADS when it
|
||||
exceeds the maximum thread count the library was compiled for.
|
||||
- fixed ?GETF2 potentially returning NaN with tiny matrix elements
|
||||
- fixed openblas_set_num_threads to work in USE_OPENMP builds
|
||||
- fixed cpu core counting in USE_OPENMP builds returning the number
|
||||
of OMP "places" rather than cores
|
||||
- fixed interpretation of USE_PERL=0 in build scripts
|
||||
- fixed linking of the library with libm in CMAKE builds
|
||||
- fixed startup delays resulting from a wrong default setting of
|
||||
NO_WARMUP in CMAKE builds
|
||||
- fixed inconsistent defaults for overriding of LAPACK SPMV, SPR,
|
||||
SYMV, SYR functions in gmake and CMAKE builds
|
||||
- fixed stride calculation in the optimized small-matrix path of
|
||||
complex SYR
|
||||
- fixed compilation of ReLAPACK with CMAKE
|
||||
- fixed pkgconfig file contents for INTERFACE64 builds
|
||||
- fixed building of Reference-LAPACK with recent gfortran
|
||||
- fixed building with only a subset of precision types on Windows
|
||||
- added new environment variable OPENBLAS_DEFAULT_NUM_THREADS
|
||||
- added a GEMV-based implementation of GEMMT
|
||||
- added support for building under QNX
|
||||
- updated support for (cross-)building for ALPHA targets
|
||||
|
||||
x86_64:
|
||||
- added autodetection of Intel Raptor Lake cpu models
|
||||
- added SSCAL microkernels for Haswell and newer targets
|
||||
- improved the performance of the Haswell DSCAL microkernel
|
||||
- added CSCAL and ZSCAL microkernels for SkylakeX targets
|
||||
- fixed detection of gfortran and Cray CCE compilers
|
||||
- fixed detection of recent versions of the Intel Fortran compiler
|
||||
- fixed compilation with LLVM to no longer run out of AVX512 registers
|
||||
- fix cpu type option setting with recent NVIDIA HPC compiler versions
|
||||
- fixed compilation for/on AMD Ryzen 4 cpus
|
||||
- fixed compilation of AVX2-capable targets with Apple Clang
|
||||
- fixed runtime selection of COOPERLAKE in DYNAMIC_ARCH builds
|
||||
- worked around gcc/llvm using risky FMA operations in CSCAL/ZSCAL
|
||||
- worked around miscompilations of GEMV, SYMV and ZDOT kernels
|
||||
by gcc12's tree-vectorizer on OSX and Windows
|
||||
|
||||
ARM:
|
||||
- fixed cross-compilation to ARMV5 and ARMV6 targets with CMAKE
|
||||
|
||||
ARMV8:
|
||||
- fixed cross-compilation to CortexA53 with CMAKE
|
||||
- fixed compilation with CMAKE and "Arm Compiler for Linux 22.1"
|
||||
- added cpu autodetection for Cortex X3 and A715
|
||||
- fixed conditional compilation of SVE-capable targets in DYNAMIC_ARCH
|
||||
- sped up SVE kernels by removing unnecessary prefetches
|
||||
- improved the GEMM performance of Neoverse V1
|
||||
- added SVE kernels for SDOT and DDOT
|
||||
- added an SBGEMM kernel for Neoverse N2
|
||||
- improved cpu-specific compiler option selection for Neoverse cpus
|
||||
- added support for setting CONSISTENT_FPCSR
|
||||
|
||||
MIPS64:
|
||||
- improved MSA capability detection and handling
|
||||
- added a MIPS64_GENERIC build target
|
||||
- fixed corner cases in DNRM2
|
||||
|
||||
LOONGARCH64:
|
||||
- fixed handling of the INTERFACE64 option
|
||||
|
||||
RISCV:
|
||||
- fixed handling of the INTERFACE64 option
|
||||
|
||||
====================================================================
|
||||
Version 0.3.21
|
||||
07-Aug-2022
|
||||
|
|
|
@ -80,7 +80,7 @@
|
|||
SUN
|
||||
Fujitsu
|
||||
|
||||
4. Supported precision
|
||||
4. Suported precision
|
||||
|
||||
Now x86/x86_64 version support 80bit FP precision in addition to
|
||||
normal double precision and single precision. Currently only
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
operation is finished.
|
||||
|
||||
|
||||
2. Similar problem may happen under virtual machine. If supervisor
|
||||
2. Simlar problem may happen under virtual machine. If supervisor
|
||||
allocates different cores for each scheduling, BLAS performance
|
||||
will be bad. This is because BLAS also utilizes all cache,
|
||||
unexpected re-schedule for different core may result of heavy
|
||||
|
|
|
@ -1,14 +1,9 @@
|
|||
pipeline {
|
||||
agent {
|
||||
docker {
|
||||
image 'osuosl/ubuntu-s390x'
|
||||
node {
|
||||
stage('Checkout') {
|
||||
checkout
|
||||
}
|
||||
}
|
||||
stages {
|
||||
|
||||
stage('Build') {
|
||||
steps {
|
||||
sh 'make clean && make'
|
||||
}
|
||||
sh("make")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,16 +0,0 @@
|
|||
pipeline {
|
||||
agent {
|
||||
docker {
|
||||
image 'osuosl/ubuntu-ppc64le:18.04'
|
||||
}
|
||||
}
|
||||
stages {
|
||||
stage('Build') {
|
||||
steps {
|
||||
sh 'sudo apt update'
|
||||
sh 'sudo apt install gfortran -y'
|
||||
sh 'make clean && make'
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
68
Makefile
68
Makefile
|
@ -1,9 +1,5 @@
|
|||
TOPDIR = .
|
||||
include ./Makefile.system
|
||||
LNCMD = ln -fs
|
||||
ifeq ($(FIXED_LIBNAME), 1)
|
||||
LNCMD = true
|
||||
endif
|
||||
|
||||
BLASDIRS = interface driver/level2 driver/level3 driver/others
|
||||
|
||||
|
@ -39,22 +35,14 @@ export NO_LAPACK
|
|||
export C_LAPACK
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER),CRAY)
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS))
|
||||
else
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
|
||||
endif
|
||||
|
||||
ifdef LAPACK_STRLEN
|
||||
LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN)
|
||||
endif
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||
|
||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||
.NOTPARALLEL : shared
|
||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||
|
||||
all :: tests
|
||||
all :: libs netlib $(RELA) tests shared
|
||||
@echo
|
||||
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
|
||||
@echo
|
||||
|
@ -138,21 +126,21 @@ endif
|
|||
@echo "to circumvent any install errors."
|
||||
@echo
|
||||
|
||||
shared : libs netlib $(RELA)
|
||||
shared :
|
||||
ifneq ($(NO_SHARED), 1)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||
@$(MAKE) -C exports so
|
||||
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
||||
@$(MAKE) -C exports so
|
||||
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
|
@ -160,12 +148,9 @@ endif
|
|||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
@$(MAKE) -C exports dll
|
||||
endif
|
||||
ifeq ($(OSNAME), AIX)
|
||||
@$(MAKE) -C exports so
|
||||
endif
|
||||
endif
|
||||
|
||||
tests : shared
|
||||
tests :
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
touch $(LIBNAME)
|
||||
ifndef NO_FBLAS
|
||||
|
@ -221,32 +206,16 @@ ifeq ($(DYNAMIC_OLDER), 1)
|
|||
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
|
||||
endif
|
||||
endif
|
||||
@echo TARGET=$(CORE) >> Makefile.conf_last
|
||||
ifdef USE_THREAD
|
||||
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||
endif
|
||||
ifdef SMP
|
||||
ifdef NUM_THREADS
|
||||
@echo NUM_THREADS=$(NUM_THREADS) >> Makefile.conf_last
|
||||
else
|
||||
@echo NUM_THREADS=$(NUM_CORES) >> Makefile.conf_last
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP),1)
|
||||
@echo USE_OPENMP=1 >> Makefile.conf_last
|
||||
endif
|
||||
ifeq ($(INTERFACE64),1)
|
||||
@echo INTERFACE64=1 >> Makefile.conf_last
|
||||
endif
|
||||
@echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
|
||||
@echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
|
||||
@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@touch lib.grd
|
||||
|
||||
prof : prof_blas prof_lapack
|
||||
|
||||
prof_blas :
|
||||
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d prof || exit 1 ; \
|
||||
|
@ -257,7 +226,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
|||
endif
|
||||
|
||||
blas :
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d libs || exit 1 ; \
|
||||
|
@ -265,7 +234,7 @@ blas :
|
|||
done
|
||||
|
||||
hpl :
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
@ -279,7 +248,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
|||
endif
|
||||
|
||||
hpl_p :
|
||||
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
@ -320,12 +289,8 @@ endif
|
|||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGIBM1)
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
endif
|
||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
@ -408,15 +373,14 @@ ifneq ($(CROSS), 1)
|
|||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
||||
endif
|
||||
|
||||
lapack-runtest: lapack-test
|
||||
lapack-runtest:
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
|
||||
|
||||
blas-test:
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
||||
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
||||
|
||||
|
|
119
Makefile.arm64
119
Makefile.arm64
|
@ -58,13 +58,6 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA76)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FT2000)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
@ -76,13 +69,13 @@ endif
|
|||
# in GCC>=9
|
||||
ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
|
@ -99,37 +92,26 @@ endif
|
|||
# in GCC>=10.4
|
||||
ifeq ($(CORE), NEOVERSEV1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifeq (1, $(ISCLANG))
|
||||
CCOMMON_OPT += -mtune=cortex-x1
|
||||
else
|
||||
CCOMMON_OPT += -mtune=neoverse-v1
|
||||
endif
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifneq ($(CROSS), 1)
|
||||
CCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a
|
||||
ifneq ($(CROSS), 1)
|
||||
FCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
|
@ -140,56 +122,40 @@ endif
|
|||
# in GCC>=10.4
|
||||
ifeq ($(CORE), NEOVERSEN2)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
|
||||
ifneq ($(OSNAME), Darwin)
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve+bf16
|
||||
ifneq ($(CROSS), 1)
|
||||
CCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a
|
||||
ifneq ($(CROSS), 1)
|
||||
FCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
FCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve+bf16 -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a+sve+bf16 -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Detect ARM Neoverse V2.
|
||||
ifeq ($(CORE), NEOVERSEV2)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv9-a -mtune=neoverse-v2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||
ifeq ($(CORE), CORTEXA55)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ8), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
|
@ -230,13 +196,8 @@ endif
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX3T110)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.3-a
|
||||
ifeq (0, $(ISCLANG))
|
||||
CCOMMON_OPT += -mtune=thunderx3t110
|
||||
else
|
||||
CCOMMON_OPT += -mtune=thunderx2t99
|
||||
endif
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
endif
|
||||
|
@ -264,47 +225,29 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), EMAG8180)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifeq ($(ISCLANG), 0)
|
||||
CCOMMON_OPT += -mtune=emag
|
||||
endif
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), A64FX)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ3) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-n1
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), CORTEXX1)
|
||||
CCOMMON_OPT += -march=armv8.2-a
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||
CCOMMON_OPT += -mtune=cortex-x1
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-x1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
@ -315,12 +258,6 @@ CCOMMON_OPT += -march=armv8.4-a+sve
|
|||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||
endif
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||
CCOMMON_OPT += -mtune=cortex-x2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mtune=cortex-x2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -340,12 +277,6 @@ CCOMMON_OPT += -march=armv8.4-a+sve
|
|||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||
endif
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||
CCOMMON_OPT += -mtune=cortex-a710
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mtune=cortex-a710
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
ifeq ($(CORE), CK860FV)
|
||||
CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||
FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
|
||||
endif
|
|
@ -2,21 +2,6 @@ TOPDIR = .
|
|||
export GOTOBLAS_MAKEFILE = 1
|
||||
-include $(TOPDIR)/Makefile.conf_last
|
||||
include ./Makefile.system
|
||||
LNCMD = ln -fs
|
||||
|
||||
ifdef THELIBNAME
|
||||
LIBNAME=$(THELIBNAME)
|
||||
LIBSONAME=$(THELIBSONAME)
|
||||
endif
|
||||
ifeq ($(FIXED_LIBNAME), 1)
|
||||
LNCMD = true
|
||||
endif
|
||||
ifeq ($(INTERFACE64),1)
|
||||
USE_64BITINT=1
|
||||
endif
|
||||
ifeq ($(USE_OPENMP),1)
|
||||
FOMP_OPT:= -fopenmp
|
||||
endif
|
||||
|
||||
PREFIX ?= /opt/OpenBLAS
|
||||
|
||||
|
@ -75,24 +60,24 @@ ifndef NO_CBLAS
|
|||
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@cp cblas.h cblas.tmp
|
||||
ifdef SYMBOLPREFIX
|
||||
@sed 's/cblas[^() ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/openblas[^() ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
|
||||
@sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
|
||||
#change back any openblas_complex_float and double that got hit
|
||||
@sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/goto[^() ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
|
||||
@sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
|
||||
endif
|
||||
ifdef SYMBOLSUFFIX
|
||||
@sed 's/cblas[^() ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/openblas[^() ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
|
||||
@sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
|
||||
#change back any openblas_complex_float and double that got hit
|
||||
@sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2
|
||||
@sed 's/goto[^() ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
|
||||
@sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
|
||||
endif
|
||||
@sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||
endif
|
||||
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
|
@ -106,7 +91,7 @@ ifneq ($(NO_STATIC),1)
|
|||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifneq ($(NO_SHARED),1)
|
||||
|
@ -114,21 +99,21 @@ ifneq ($(NO_SHARED),1)
|
|||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
|
@ -142,7 +127,7 @@ endif
|
|||
|
||||
else
|
||||
#install on AIX has different options syntax
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
|
@ -156,15 +141,15 @@ ifneq ($(NO_STATIC),1)
|
|||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
@ -177,12 +162,9 @@ endif
|
|||
|
||||
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
|
||||
@echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)"
|
||||
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
|
||||
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
|
||||
@echo 'omp_opt='$(FOMP_OPT) >> "$(PKGFILE)"
|
||||
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
|
||||
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
|
||||
@echo 'version='$(VERSION) >> "$(PKGFILE)"
|
||||
@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
|
||||
@cat openblas.pc.in >> "$(PKGFILE)"
|
||||
|
@ -196,7 +178,7 @@ endif
|
|||
ifneq ($(NO_SHARED),1)
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
MSA_FLAGS = -mmsa -mfp64 -mload-store-pairs
|
||||
ifdef BINARY64
|
||||
else
|
||||
endif
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
MSA_FLAGS = -mmsa -mfp64 -mload-store-pairs
|
||||
ifdef BINARY64
|
||||
else
|
||||
endif
|
||||
|
|
|
@ -11,23 +11,11 @@ endif
|
|||
|
||||
ifeq ($(CORE), POWER10)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
else ifneq ($(GCCVERSIONGT4), 1)
|
||||
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
|
||||
else
|
||||
$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
endif
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
@ -50,18 +38,19 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
|||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr9 -qtune=pwr9 -qfloat=nomaf -qzerosize
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -fno-fast-math -mcpu=power9 -mtune=power9
|
||||
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
|
||||
FCOMMON_OPT += -mcpu=power8 -mtune=power8
|
||||
else
|
||||
FCOMMON_OPT += -mcpu=power9 -mtune=power9
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -mcpu=power9 -mtune=power9
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive
|
||||
|
@ -77,16 +66,12 @@ endif
|
|||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(OSNAME), AIX)
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
else
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -O2 -Mrecursive
|
||||
|
@ -99,20 +84,13 @@ CCOMMON_OPT += -DUSE_OPENMP -fopenmp
|
|||
else
|
||||
CCOMMON_OPT += -DUSE_OPENMP -mp
|
||||
endif
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -DUSE_OPENMP
|
||||
else
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
FCOMMON_OPT += -DUSE_OPENMP -fopenmp
|
||||
else
|
||||
FCOMMON_OPT += -DUSE_OPENMP -mp
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CCOMMON_OPT += -fno-integrated-as
|
||||
endif
|
||||
# workaround for C->FORTRAN ABI violation in LAPACKE
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
|
@ -147,19 +125,8 @@ endif
|
|||
ifdef BINARY64
|
||||
|
||||
|
||||
ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), GCCIBMAIX)
|
||||
$(error Using GCC and XLF on AIX is not a supported combination.)
|
||||
endif
|
||||
ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), CLANGGFORTRANAIX)
|
||||
$(error Using Clang and gFortran on AIX is not a supported combination.)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -mpowerpc64 -maix64
|
||||
else
|
||||
CCOMMON_OPT += -m64
|
||||
endif
|
||||
ifeq ($(COMPILER_F77), g77)
|
||||
FCOMMON_OPT += -mpowerpc64 -maix64
|
||||
endif
|
||||
|
|
|
@ -55,10 +55,6 @@ ifeq ($(TARGET), C910V)
|
|||
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), CK860FV)
|
||||
TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), x280)
|
||||
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
|
||||
endif
|
||||
|
|
|
@ -8,13 +8,13 @@ FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
|
|||
endif
|
||||
ifeq ($(CORE), RISCV64_ZVL256B)
|
||||
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_ZVL128B)
|
||||
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_GENERIC)
|
||||
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
|
||||
endif
|
||||
|
|
|
@ -3,12 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.28.dev
|
||||
|
||||
# If you set this prefix, the library name will be lib$(LIBNAMEPREFIX)openblas.a
|
||||
# and lib$(LIBNAMEPREFIX)openblas.so, with a matching soname in the shared library
|
||||
#
|
||||
# LIBNAMEPREFIX = scipy
|
||||
VERSION = 0.3.21.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
@ -134,12 +129,6 @@ VERSION = 0.3.28.dev
|
|||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||
BUILD_LAPACK_DEPRECATED = 1
|
||||
|
||||
# The variable type assumed for the length of character arguments when passing
|
||||
# data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC
|
||||
# versions used "int"). Mismatches will not cause runtime failures but may result
|
||||
# in build warnings or errors when building with link-time optimization (LTO)
|
||||
# LAPACK_STRLEN=int
|
||||
|
||||
# Build RecursiveLAPACK on top of LAPACK
|
||||
# BUILD_RELAPACK = 1
|
||||
# Have RecursiveLAPACK actually replace standard LAPACK routines instead of
|
||||
|
@ -179,10 +168,6 @@ NO_AFFINITY = 1
|
|||
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# BIGNUMA = 1
|
||||
|
||||
# If you are compiling for an embedded system ("bare metal") like Cortex M series
|
||||
# Note that you will have to provide implementations of malloc() and free() in this case
|
||||
# EMBEDDED = 1
|
||||
|
||||
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
|
||||
# and OS. However, the performance is low.
|
||||
# NO_AVX = 1
|
||||
|
@ -225,16 +210,6 @@ NO_AFFINITY = 1
|
|||
# to the user space. If bigphysarea is enabled, it will use it.
|
||||
# DEVICEDRIVER_ALLOCATION = 1
|
||||
|
||||
# Use large page allocation (called hugepage support in Linux context)
|
||||
# for the thread buffers (with access by shared memory operations)
|
||||
# HUGETLB_ALLOCATION = 1
|
||||
|
||||
# Use large page allocation (called hugepages in Linux) based on mmap accessing
|
||||
# a memory-backed pseudofile (requires hugetlbfs to be mounted in the system,
|
||||
# the example below has it mounted on /hugepages. OpenBLAS will create the backing
|
||||
# file as gotoblas.processid in that path)
|
||||
# HUGETLBFILE_ALLOCATION = /hugepages
|
||||
|
||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only).
|
||||
# CONSISTENT_FPCSR = 1
|
||||
|
||||
|
|
199
Makefile.system
199
Makefile.system
|
@ -268,46 +268,24 @@ SMALL_MATRIX_OPT = 1
|
|||
else ifeq ($(ARCH), power)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
BUILD_BFLOAT16 = 1
|
||||
else ifeq ($(ARCH), arm64)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
endif
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
endif
|
||||
ifeq ($(ARCH), arm64)
|
||||
GEMM_GEMV_FORWARD = 1
|
||||
endif
|
||||
ifeq ($(ARCH), riscv)
|
||||
GEMM_GEMV_FORWARD = 1
|
||||
endif
|
||||
ifeq ($(ARCH), power)
|
||||
GEMM_GEMV_FORWARD = 1
|
||||
GEMM_GEMV_FORWARD_BF16 = 1
|
||||
endif
|
||||
|
||||
ifeq ($(SMALL_MATRIX_OPT), 1)
|
||||
CCOMMON_OPT += -DSMALL_MATRIX_OPT
|
||||
endif
|
||||
ifneq ($(ONLY_CBLAS), 1)
|
||||
ifeq ($(GEMM_GEMV_FORWARD), 1)
|
||||
CCOMMON_OPT += -DGEMM_GEMV_FORWARD
|
||||
endif
|
||||
ifeq ($(GEMM_GEMV_FORWARD_BF16), 1)
|
||||
CCOMMON_OPT += -DGEMM_GEMV_FORWARD_BF16
|
||||
endif
|
||||
endif
|
||||
|
||||
# This operation is expensive, so it should be executed only once.
|
||||
ifndef GOTOBLAS_MAKEFILE
|
||||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
||||
# Determine if the assembler is GNU Assembler
|
||||
HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
|
||||
GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
|
||||
|
||||
# Generating Makefile.conf and config.h
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||
|
||||
endif
|
||||
|
||||
ifndef TARGET_CORE
|
||||
-include $(TOPDIR)/Makefile.conf
|
||||
include $(TOPDIR)/Makefile.conf
|
||||
else
|
||||
HAVE_NEON=
|
||||
HAVE_VFP=
|
||||
|
@ -328,6 +306,7 @@ HAVE_FMA3=
|
|||
include $(TOPDIR)/Makefile_kernel.conf
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
ifndef NUM_PARALLEL
|
||||
NUM_PARALLEL = 1
|
||||
|
@ -389,9 +368,8 @@ GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
|||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||
GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
# Note that the behavior of -dumpversion is compile-time-configurable for
|
||||
# gcc-7.x and newer. Use -dumpfullversion there
|
||||
ifeq ($(GCCVERSIONGTEQ7),1)
|
||||
|
@ -405,11 +383,6 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
|
|||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
||||
endif
|
||||
|
||||
#
|
||||
# OS dependent settings
|
||||
#
|
||||
|
@ -418,22 +391,11 @@ ifeq ($(OSNAME), Darwin)
|
|||
ifndef MACOSX_DEPLOYMENT_TARGET
|
||||
ifeq ($(ARCH), arm64)
|
||||
export MACOSX_DEPLOYMENT_TARGET=11.0
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
export NO_SVE = 1
|
||||
endif
|
||||
else
|
||||
export MACOSX_DEPLOYMENT_TARGET=10.8
|
||||
endif
|
||||
endif
|
||||
MD5SUM = md5 -r
|
||||
XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
|
||||
ifeq (x$(XCVER)x,xx)
|
||||
XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
|
||||
endif
|
||||
ifeq (x$(XCVER), x 15)
|
||||
CCOMMON_OPT += -Wl,-ld_classic
|
||||
FCOMMON_OPT += -Wl,-ld_classic
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
|
||||
|
@ -457,7 +419,7 @@ ifeq ($(OSNAME), AIX)
|
|||
EXTRALIB += -lm
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
|
@ -634,9 +596,6 @@ endif
|
|||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CCOMMON_OPT += -fopenmp
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB))
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), INTEL)
|
||||
|
@ -685,7 +644,7 @@ DYNAMIC_CORE += HASWELL ZEN
|
|||
endif
|
||||
ifneq ($(NO_AVX512), 1)
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
|
||||
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
@ -702,13 +661,13 @@ ifeq ($(ARCH), arm64)
|
|||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA53
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += NEOVERSEN1
|
||||
ifneq ($(NO_SVE), 1)
|
||||
DYNAMIC_CORE += NEOVERSEV1
|
||||
DYNAMIC_CORE += NEOVERSEN2
|
||||
DYNAMIC_CORE += ARMV8SVE
|
||||
DYNAMIC_CORE += A64FX
|
||||
endif
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
|
@ -731,18 +690,7 @@ endif
|
|||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
DYNAMIC_CORE = LA64_GENERIC LA264 LA464
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), riscv64)
|
||||
DYNAMIC_CORE = RISCV64_GENERIC
|
||||
DYNAMIC_CORE += RISCV64_ZVL128B
|
||||
DYNAMIC_CORE += RISCV64_ZVL256B
|
||||
ifdef DYNAMIC_LIST
|
||||
override DYNAMIC_CORE = RISCV64_GENERIC $(DYNAMIC_LIST)
|
||||
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_RISCV64_GENERIC
|
||||
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||
endif
|
||||
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
|
@ -793,11 +741,7 @@ DYNAMIC_CORE += POWER9
|
|||
else
|
||||
# Tell the user why POWER9 is missing from DYNAMIC_CORE. Note: no comma after
# "info" - GNU Make only recognises a function name when it is followed by
# whitespace, otherwise the whole text is expanded as an (empty) variable.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
ifeq ($(OSNAME), AIX)
|
||||
LDVERSIONGTEQ35 := 1
|
||||
else
|
||||
LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
|
||||
endif
|
||||
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
|
||||
DYNAMIC_CORE += POWER10
|
||||
CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
||||
|
@ -847,12 +791,8 @@ ifeq ($(ARCH), arm)
|
|||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
|
||||
ifneq ($(EMBEDDED), 1)
|
||||
CCOMMON_OPT += -marm
|
||||
FCOMMON_OPT += -marm
|
||||
else
|
||||
CCOMMON_OPT += -DOS_EMBEDDED -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16
|
||||
endif
|
||||
|
||||
# If softfp abi is mentioned on the command line, force it.
|
||||
ifeq ($(ARM_SOFTFP_ABI), 1)
|
||||
|
@ -887,37 +827,13 @@ endif
|
|||
ifeq ($(ARCH), riscv64)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -fdefault-integer-8
|
||||
endif
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -fdefault-integer-8
|
||||
endif
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), csky)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
#
|
||||
# C Compiler dependent settings
|
||||
|
@ -994,18 +910,8 @@ BINARY_DEFINED = 1
|
|||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
|
||||
LA64_ARCH=$(shell $(CC) -march=loongarch64 -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo loongarch64)
|
||||
ifneq ($(LA64_ABI), lp64d)
|
||||
LA64_ABI=lp64
|
||||
endif
|
||||
ifneq ($(LA64_ARCH), loongarch64)
|
||||
CCOMMON_OPT += -mabi=$(LA64_ABI)
|
||||
FCOMMON_OPT += -mabi=$(LA64_ABI)
|
||||
else
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
|
||||
endif
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
endif
|
||||
|
||||
endif
|
||||
|
@ -1040,19 +946,16 @@ endif
|
|||
endif
|
||||
ifdef BINARY64
|
||||
ifeq ($(ARCH), x86_64)
|
||||
ifeq (,$(findstring tp,$(CFLAGS)))
|
||||
ifneq ($(NEWPGI2),1)
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
CCOMMON_OPT += -tp px
|
||||
endif
|
||||
endif
|
||||
ifneq ($(NEWPGI),1)
|
||||
CCOMMON_OPT += -D__MMX__ -Mnollvm
|
||||
endif
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq (,$(findstring tp,$(CFLAGS)))
|
||||
ifeq ($(CORE), POWER8)
|
||||
CCOMMON_OPT += -tp pwr8
|
||||
endif
|
||||
|
@ -1061,17 +964,14 @@ CCOMMON_OPT += -tp pwr9
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
ifneq ($(NEWPGI2),1)
|
||||
ifeq (,$(findstring tp,$(CFLAGS)))
|
||||
CCOMMON_OPT += -tp p7
|
||||
else
|
||||
CCOMMON_OPT += -tp px
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PATHSCALE)
|
||||
ifdef BINARY64
|
||||
|
@ -1154,9 +1054,8 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
FCOMMON_OPT += -Wall
|
||||
# make single-threaded LAPACK calls thread-safe #1847
|
||||
FCOMMON_OPT += -frecursive
|
||||
|
@ -1170,7 +1069,6 @@ EXTRALIB += -lgfortran
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifdef NO_BINARY_MODE
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
||||
ifdef BINARY64
|
||||
|
@ -1226,10 +1124,6 @@ endif
|
|||
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
CCOMMON_OPT += -DF_INTERFACE_IBM
|
||||
FEXTRALIB += -lxlf90
|
||||
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
|
||||
FCOMMON_OPT += -qextname -qzerosize
|
||||
endif
|
||||
# FCOMMON_OPT += -qarch=440
|
||||
ifdef BINARY64
|
||||
FCOMMON_OPT += -q64
|
||||
|
@ -1241,6 +1135,9 @@ endif
|
|||
else
|
||||
FCOMMON_OPT += -q32
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), PGI)
|
||||
|
@ -1423,8 +1320,6 @@ ifeq ($(F_COMPILER), SUN)
|
|||
FCOMMON_OPT += -pic
|
||||
else ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -PIC
|
||||
else ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -qpic=large
|
||||
else
|
||||
FCOMMON_OPT += -fPIC
|
||||
endif
|
||||
|
@ -1468,10 +1363,6 @@ ifeq ($(NO_AVX512), 1)
|
|||
CCOMMON_OPT += -DNO_AVX512
|
||||
endif
|
||||
|
||||
ifeq ($(NO_SVE), 1)
|
||||
CCOMMON_OPT += -DNO_SVE
|
||||
endif
|
||||
|
||||
ifdef SMP
|
||||
CCOMMON_OPT += -DSMP_SERVER
|
||||
|
||||
|
@ -1560,28 +1451,16 @@ ifndef LIBSONAMEBASE
|
|||
LIBSONAMEBASE = openblas
|
||||
endif
|
||||
|
||||
ifndef LIBNAMEPREFIX
|
||||
LIBNAMEPREFIX =
|
||||
endif
|
||||
|
||||
SYMPREFIX=$(SYMBOLPREFIX)
|
||||
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
|
||||
SYMPREFIX=
|
||||
endif
|
||||
SYMSUFFIX=$(SYMBOLSUFFIX)
|
||||
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
|
||||
SYMSUFFIX=
|
||||
endif
|
||||
ifndef LIBNAMESUFFIX
|
||||
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
|
||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
|
||||
else
|
||||
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
|
||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE)
|
||||
LIBPREFIX = cyg$(LIBNAMEBASE)
|
||||
else
|
||||
LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE)
|
||||
LIBPREFIX = lib$(LIBNAMEBASE)
|
||||
endif
|
||||
|
||||
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
||||
|
@ -1625,23 +1504,13 @@ ifdef FUNCTION_PROFILE
|
|||
CCOMMON_OPT += -DFUNCTION_PROFILE
|
||||
endif
|
||||
|
||||
ifdef SHMEM_ALLOCATION
|
||||
ifneq ($(SHMEM_ALLOCATION), 0)
|
||||
CCOMMON_OPT += -DALLOC_SHM
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef HUGETLB_ALLOCATION
|
||||
ifneq ($(HUGETLB_ALLOCATION), 0)
|
||||
CCOMMON_OPT += -DALLOC_HUGETLB
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef HUGETLBFILE_ALLOCATION
|
||||
ifneq ($(HUGETLBFILE_ALLOCATION), 0)
|
||||
CCOMMON_OPT += -DALLOC_HUGETLBFILE -DHUGETLB_FILE_NAME=$(HUGETLBFILE_ALLOCATION)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef STATIC_ALLOCATION
|
||||
CCOMMON_OPT += -DALLOC_STATIC
|
||||
|
@ -1699,11 +1568,9 @@ override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
|||
|
||||
ifeq ($(NEED_PIC), 1)
|
||||
ifeq (,$(findstring PIC,$(FFLAGS)))
|
||||
ifneq ($(F_COMPILER),IBM)
|
||||
override FFLAGS += -fPIC
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
#For LAPACK Fortran codes.
|
||||
#Disable -fopenmp for LAPACK Fortran codes on Windows.
|
||||
|
@ -1717,15 +1584,11 @@ endif
|
|||
|
||||
ifeq ($(F_COMPILER),NAG)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
ifeq ($(F_COMPILER),CRAY)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
ifeq ($(F_COMPILER),FLANGNEW)
|
||||
LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
|
||||
override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS))
|
||||
FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
|
||||
LAPACK_CFLAGS = $(CFLAGS)
|
||||
|
@ -1774,14 +1637,14 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(FIXED_LIBNAME),1)
|
||||
LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
|
||||
# Profiled-library counterpart of the fixed LIBNAME above; must use the same
# LIBSONAMEBASE variable so both names share the same base.
LIBNAME_P = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
|
||||
endif
|
||||
|
||||
LIBDLLNAME = $(LIBPREFIX).dll
|
||||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
||||
ifneq ($(OSNAME), AIX)
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
||||
else
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
||||
endif
|
||||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
||||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
||||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
||||
|
@ -1868,8 +1731,6 @@ export TARGET_CORE
|
|||
export NO_AVX512
|
||||
export NO_AVX2
|
||||
export BUILD_BFLOAT16
|
||||
export NO_LSX
|
||||
export NO_LASX
|
||||
|
||||
export SBGEMM_UNROLL_M
|
||||
export SBGEMM_UNROLL_N
|
||||
|
|
|
@ -8,11 +8,6 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
ifeq ($(findstring icx,$(CC)),icx)
|
||||
CCOMMON_OPT += -fp-model=consistent
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(DYNAMIC_ARCH),1)
|
||||
ADD_CPUFLAGS = 1
|
||||
|
@ -80,31 +75,18 @@ endif
|
|||
ifeq ($(CORE), COOPERLAKE)
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # gcc not support, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# cooperlake support was added in clang 9
|
||||
ifeq ($(CLANGVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # gcc not support, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
@ -122,52 +104,19 @@ endif
|
|||
ifeq ($(CORE), SAPPHIRERAPIDS)
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# sapphire rapids support was added in 11
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # gcc not support, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# sapphire rapids support was added in clang 12
|
||||
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
# sapphire rapids support was added in 11
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
ifdef HAVE_AVX512VL
|
||||
ifndef NO_AVX512
|
||||
else # gcc not support, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
@ -180,8 +129,6 @@ endif
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifdef HAVE_AVX2
|
||||
ifndef NO_AVX2
|
||||
|
|
129
README.md
129
README.md
|
@ -2,20 +2,20 @@
|
|||
|
||||
[](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
|
||||
Cirrus CI: [](https://cirrus-ci.com/github/xianyi/OpenBLAS)
|
||||
Travis CI: [](https://travis-ci.com/xianyi/OpenBLAS)
|
||||
|
||||
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
Drone CI: [](https://cloud.drone.io/xianyi/OpenBLAS/)
|
||||
|
||||
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
||||
|
||||
OSUOSL POWERCI [](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
|
||||
|
||||
OSUOSL IBMZ-CI [](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
|
||||
## Introduction
|
||||
|
||||
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documentation in the OpenBLAS folder: <https://github.com/OpenMathLib/OpenBLAS/docs>.
|
||||
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
|
||||
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
|
||||
|
@ -27,12 +27,12 @@ We provide official binary packages for the following platform:
|
|||
|
||||
* Windows x86/x86_64
|
||||
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/OpenMathLib/OpenBLAS/releases](https://github.com/OpenMathLib/OpenBLAS/releases).
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
|
||||
|
||||
## Installation from Source
|
||||
|
||||
Download from project homepage, https://github.com/OpenMathLib/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/OpenMathLib/OpenBLAS.git. (If you want the most up to date version, be
|
||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
|
||||
sure to use the develop branch - master is several years out of date due to a change of maintainership.)
|
||||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||
Most can also be given directly on the make or cmake command line.
|
||||
|
@ -41,45 +41,40 @@ Most can also be given directly on the make or cmake command line.
|
|||
|
||||
Building OpenBLAS requires the following to be installed:
|
||||
|
||||
* GNU Make or CMake
|
||||
* GNU Make
|
||||
* A C compiler, e.g. GCC or Clang
|
||||
* A Fortran compiler (optional, for LAPACK)
|
||||
|
||||
* IBM MASS (optional, see below)
|
||||
|
||||
### Normal compile
|
||||
|
||||
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
|
||||
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
|
||||
The full target list is in the file `TargetList.txt`. Other build options are documented in Makefile.rule and
|
||||
can either be set there (typically by removing the comment character from the respective line), or used on the
|
||||
`make` command line.
|
||||
Note that when you run `make install` after building, you need to repeat all command line options you provided to `make`
|
||||
in the build step, as some settings like the supported maximum number of threads are automatically derived from the
|
||||
build host by default, which might not be what you want.
|
||||
For building with `cmake`, the usual conventions apply, i.e. create a build directory either underneath the toplevel
|
||||
OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any
|
||||
build options you plan to set.
|
||||
The full target list is in the file `TargetList.txt`. For building with `cmake`, the
|
||||
usual conventions apply, i.e. create a build directory either underneath the toplevel
|
||||
OpenBLAS source directory or separate from it, and invoke `cmake` there with the path
|
||||
to the source tree and any build options you plan to set.
|
||||
|
||||
### Cross compile
|
||||
|
||||
Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler.
|
||||
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
|
||||
The target must be specified explicitly when cross compiling.
|
||||
|
||||
Examples:
|
||||
|
||||
* On a Linux system, cross-compiling to an older MIPS64 router board:
|
||||
* On an x86 box, compile this library for a loongson3a CPU:
|
||||
```sh
|
||||
make BINARY=64 CC=mipsisa64r6el-linux-gnuabi64-gcc FC=mipsisa64r6el-linux-gnuabi64-gfortran HOSTCC=gcc TARGET=P6600
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
```
|
||||
* or to a Windows x64 host:
|
||||
or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI:
|
||||
```sh
|
||||
make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc
|
||||
make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A
|
||||
```
|
||||
|
||||
You can find instructions for other cases both in the "Supported Systems" section below and in the docs folder. The .yml scripts included with the sources (which contain the
|
||||
build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints.
|
||||
|
||||
When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build.
|
||||
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
|
||||
```sh
|
||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
```
|
||||
|
||||
### Debug version
|
||||
|
||||
|
@ -118,7 +113,7 @@ Use `PREFIX=` when invoking `make`, for example
|
|||
```sh
|
||||
make install PREFIX=your_installation_directory
|
||||
```
|
||||
(along with all options you added on the `make` command line in the preceding build step)
|
||||
|
||||
The default installation directory is `/opt/OpenBLAS`.
|
||||
|
||||
## Supported CPUs and Operating Systems
|
||||
|
@ -138,7 +133,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD ZEN**: Uses Haswell codes with some optimizations for Zen 2/3 (use SkylakeX for Zen4)
|
||||
- **AMD ZEN**: Uses Haswell codes with some optimizations.
|
||||
|
||||
#### MIPS32
|
||||
|
||||
|
@ -163,7 +158,6 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||
- **Cortex A72**: same as A57 ( different cpu specifications)
|
||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||
- **Cortex A76**: same as A57 (different cpu specifications)
|
||||
- **Falkor**: same as A57 (different cpu specifications)
|
||||
- **ThunderX**: Optimized some Level-1 functions
|
||||
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
||||
|
@ -171,21 +165,13 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
- **TSV110**: Optimized some Level-3 helper functions
|
||||
- **EMAG 8180**: preliminary support based on A57
|
||||
- **Neoverse N1**: (AWS Graviton2) preliminary support
|
||||
- **Neoverse V1**: (AWS Graviton3) optimized Level-3 BLAS
|
||||
- **Apple Vortex**: preliminary support based on ThunderX2/3
|
||||
- **A64FX**: preliminary support, optimized Level-3 BLAS
|
||||
- **ARMV8SVE**: any ARMV8 cpu with SVE extensions
|
||||
- **Apple Vortex**: preliminary support based on ARMV8
|
||||
|
||||
#### PPC/PPC64
|
||||
|
||||
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
|
||||
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
||||
- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
|
||||
|
||||
- **AIX**: Dynamic architecture with OpenXL and OpenMP.
|
||||
```sh
|
||||
make CC=ibm-clang_r FC=xlf_r TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
|
||||
```
|
||||
- **POWER10**:
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
|
@ -198,66 +184,28 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
```sh
|
||||
make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
|
||||
(also known to work on C906)
|
||||
|
||||
- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
|
||||
- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
|
||||
e.g.:
|
||||
```sh
|
||||
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
|
||||
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
|
||||
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
|
||||
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j
|
||||
```
|
||||
|
||||
#### LOONGARCH64
|
||||
|
||||
- **LA64_GENERIC**: Optimized Level-3, Level-2 and Level-1 BLAS with scalar instruction
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=LA64_GENERIC CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
|
||||
```
|
||||
The old-style TARGET=LOONGSONGENERIC is still supported
|
||||
|
||||
- **LA264**: Optimized Level-3, Level-2 and Level-1 BLAS with LSX instruction
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=LA264 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
|
||||
```
|
||||
The old-style TARGET=LOONGSON2K1000 is still supported
|
||||
|
||||
- **LA464**: Optimized Level-3, Level-2 and Level-1 BLAS with LASX instruction
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=LA464 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1
|
||||
```
|
||||
The old-style TARGET=LOONGSON3R5 is still supported
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
|
||||
|
||||
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
|
||||
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
|
||||
|
||||
`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
|
||||
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
|
||||
|
||||
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.
|
||||
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
|
||||
|
||||
For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
|
||||
|
||||
On **ZARCH**, it comprises Z13 and Z14 as well as generic zarch support.
|
||||
|
||||
On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support. A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled.
|
||||
|
||||
On **LoongArch64**, it comprises LA264 and LA464 as well as generic LoongArch64 support.
|
||||
|
||||
The `TARGET` option can - and usually **should** - be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library; usually you will want to set this to the oldest model you expect to encounter.
|
||||
Failure to specify this may lead to advanced instructions being used by the compiler, just because the build host happens to support them. This is most likely to happen when aggressive optimization options are in effect, and the resulting library may then crash with an
|
||||
illegal instruction error on weaker hardware, before it even reaches the BLAS routines specifically included for that cpu.
|
||||
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
|
||||
|
||||
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
|
||||
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
|
||||
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
|
||||
|
||||
### Supported OS
|
||||
|
@ -270,7 +218,7 @@ Please note that it is not possible to combine support for different architectur
|
|||
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER10
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||
|
@ -311,21 +259,20 @@ If you compile this library with `USE_OPENMP=1`, you should use the above functi
|
|||
|
||||
## Reporting bugs
|
||||
|
||||
Please submit an issue in https://github.com/OpenMathLib/OpenBLAS/issues.
|
||||
Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
|
||||
|
||||
## Contact
|
||||
|
||||
+ Use github discussions: https://github.com/OpenMathLib/OpenBLAS/discussions
|
||||
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
|
||||
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
|
||||
|
||||
## Change log
|
||||
|
||||
Please see Changelog.txt.
|
||||
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
* Please read the [FAQ](https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/faq.md) in the docs folder first.
|
||||
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
|
||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
|
@ -342,9 +289,9 @@ Please see Changelog.txt.
|
|||
|
||||
## Contributing
|
||||
|
||||
1. [Check for open issues](https://github.com/OpenMathLib/OpenBLAS/issues) or open a fresh issue
|
||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
|
||||
to start a discussion around a feature idea or a bug.
|
||||
2. Fork the [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) repository to start making your changes.
|
||||
2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||
3. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||
|
||||
|
|
20
SECURITY.md
20
SECURITY.md
|
@ -1,20 +0,0 @@
|
|||
# Security Policy
|
||||
|
||||
## Supported Versions
|
||||
|
||||
It is generally recommended to use the latest release as this project
|
||||
does not maintain multiple stable branches and providing packages e.g.
|
||||
for Linux distributions is outside our scope. In particular, versions
|
||||
before 0.3.18 can be assumed to carry the out-of-bounds-read error in
|
||||
the LAPACK ?LARRV family of functions that was the subject of
|
||||
CVE-2021-4048.
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
If you suspect that you have found a vulnerability - a defect that could
|
||||
be abused to compromise the security of a user's code or systems - please
|
||||
do not use the normal github issue tracker (except perhaps to post a general
|
||||
warning if you deem that necessary). Instead, please contact the project
|
||||
maintainers through the email addresses given in their github user profiles.
|
||||
Defects found in the "lapack-netlib" subtree should ideally be reported to
|
||||
the maintainers of the reference implementation of LAPACK, lapack@icl.utk.edu.
|
|
@ -93,7 +93,6 @@ CORTEXA53
|
|||
CORTEXA57
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
CORTEXA76
|
||||
CORTEXA510
|
||||
CORTEXA710
|
||||
CORTEXX1
|
||||
|
@ -126,17 +125,9 @@ x280
|
|||
RISCV64_ZVL256B
|
||||
|
||||
11.LOONGARCH64:
|
||||
// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names,
|
||||
// and it is recommended to use the more standardized naming conventions
|
||||
// LA64_GENERIC/LA264/LA464. You can still specify TARGET as
|
||||
// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 during compilation or runtime,
|
||||
// and they will be internally relocated to LA64_GENERIC/LA264/LA464.
|
||||
LOONGSONGENERIC
|
||||
LOONGSON2K1000
|
||||
LOONGSON3R5
|
||||
LA64_GENERIC
|
||||
LA264
|
||||
LA464
|
||||
LOONGSON2K1000
|
||||
|
||||
12. Elbrus E2000:
|
||||
E2K
|
||||
|
@ -145,7 +136,3 @@ E2K
|
|||
EV4
|
||||
EV5
|
||||
EV6
|
||||
|
||||
14.CSKY
|
||||
CSKY
|
||||
CK860FV
|
||||
|
|
|
@ -115,7 +115,7 @@ jobs:
|
|||
mkdir build
|
||||
cd build
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
|
@ -133,29 +133,29 @@ jobs:
|
|||
mkdir build
|
||||
cd build
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang-new -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
ctest --rerun-failed --output-on-failure
|
||||
|
||||
|
||||
|
||||
- job: OSX_OpenMP
|
||||
pool:
|
||||
vmImage: 'macOS-12'
|
||||
vmImage: 'macOS-11'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-13 FC=gfortran-13
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-13 FC=gfortran-13 PREFIX=../blasinst install
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install
|
||||
ls -lR ../blasinst
|
||||
|
||||
- job: OSX_GCC_Nothreads
|
||||
pool:
|
||||
vmImage: 'macOS-12'
|
||||
vmImage: 'macOS-11'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make USE_THREADS=0 CC=gcc-13 FC=gfortran-13
|
||||
make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
- job: OSX_GCC12
|
||||
pool:
|
||||
|
@ -167,10 +167,11 @@ jobs:
|
|||
|
||||
- job: OSX_OpenMP_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-latest'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOSX_DEPLOYMENT_TARGET: 11.0
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
|
@ -179,7 +180,7 @@ jobs:
|
|||
|
||||
- job: OSX_OpenMP_Clang_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-latest'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
|
@ -195,7 +196,7 @@ jobs:
|
|||
|
||||
- job: OSX_dynarch_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-12'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
|
@ -203,16 +204,16 @@ jobs:
|
|||
- script: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-13 -DCMAKE_Fortran_COMPILER=gfortran-13 -DBUILD_SHARED_LIBS=ON ..
|
||||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
|
||||
cmake --build .
|
||||
ctest
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-latest'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/edb4dc2f-266f-47f2-8d56-21bc7764e119/m_HPCKit_p_2023.2.0.49443.dmg
|
||||
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
|
||||
steps:
|
||||
|
@ -242,7 +243,7 @@ jobs:
|
|||
|
||||
- job: OSX_NDK_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-12'
|
||||
vmImage: 'macOS-11'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
|
@ -252,45 +253,32 @@ jobs:
|
|||
|
||||
- job: OSX_IOS_ARMV8
|
||||
pool:
|
||||
vmImage: 'macOS-12'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch arm64 -miphoneos-version-min=10.0
|
||||
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0
|
||||
steps:
|
||||
- script: |
|
||||
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: OSX_IOS_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-12'
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.2.sdk -arch armv7 -miphoneos-version-min=5.1
|
||||
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1
|
||||
steps:
|
||||
- script: |
|
||||
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: OSX_xbuild_DYNAMIC_ARM64
|
||||
pool:
|
||||
vmImage: 'macOS-12'
|
||||
variables:
|
||||
CC: /Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX13.1.sdk -arch arm64
|
||||
steps:
|
||||
- script: |
|
||||
ls /Applications/Xcode_14.2.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
|
||||
/Applications/Xcode_12.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
|
||||
/Applications/Xcode_14.2.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
|
||||
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: ALPINE_MUSL
|
||||
pool:
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
|
||||
&& echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06 alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
|
||||
&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64
|
||||
|
|
6927
benchmark/Makefile
6927
benchmark/Makefile
File diff suppressed because it is too large
Load Diff
|
@ -92,7 +92,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
||||
|
||||
|
|
|
@ -85,7 +85,7 @@ int main(int argc, char *argv[]){
|
|||
double time1, time2, timeg1,timeg2;
|
||||
|
||||
char *p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
|
|
|
@ -1,122 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#undef OMATCOPY
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
#define OMATCOPY BLASFUNC(domatcopy)
|
||||
#else
|
||||
#define OMATCOPY BLASFUNC(somatcopy)
|
||||
#endif
|
||||
#else
|
||||
#ifdef DOUBLE
|
||||
#define OMATCOPY BLASFUNC(zomatcopy)
|
||||
#else
|
||||
#define OMATCOPY BLASFUNC(comatcopy)
|
||||
#endif
|
||||
#endif
|
||||
int main(int argc, char *argv[]){
|
||||
FLOAT *a, *b;
|
||||
FLOAT alpha[] = {1.0, 0.0};
|
||||
char trans = 'N';
|
||||
char order = 'C';
|
||||
blasint crows, ccols, clda, cldb;
|
||||
int loops = 1;
|
||||
char *p;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int i, j;
|
||||
|
||||
double time1, timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++; }
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++; }
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++; }
|
||||
|
||||
if ((p = getenv("OPENBLAS_TRANS"))) {
|
||||
trans=*p;
|
||||
}
|
||||
if ((p = getenv("OPENBLAS_ORDER"))) {
|
||||
order=*p;
|
||||
}
|
||||
TOUPPER(trans);
|
||||
TOUPPER(order);
|
||||
fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c : Order=%c\n", from, to, step, trans, order);
|
||||
p = getenv("OPENBLAS_LOOPS");
|
||||
if ( p != NULL ) {
|
||||
loops = atoi(p);
|
||||
}
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
#ifdef __linux
|
||||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
for (i = 0; i < to * to * COMPSIZE; i++) {
|
||||
a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
for (i = 0; i < to * to * COMPSIZE; i++) {
|
||||
b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time\n");
|
||||
for (i = from; i <= to; i += step) {
|
||||
cldb = clda = crows = ccols = i;
|
||||
fprintf(stderr, " ROWS=%4d, COLS=%4d : ", (int)crows, (int)ccols);
|
||||
begin();
|
||||
|
||||
for (j=0; j<loops; j++) {
|
||||
OMATCOPY (&order, &trans, &crows, &ccols, alpha, a, &clda, b, &cldb);
|
||||
}
|
||||
|
||||
end();
|
||||
time1 = getsec();
|
||||
|
||||
timeg = time1/loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * (double)ccols * (double)crows / timeg * 1.e-6, time1);
|
||||
}
|
||||
|
||||
free(a);
|
||||
free(b);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -120,7 +120,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
|
||||
|
||||
|
|
|
@ -1,49 +0,0 @@
|
|||
# Continuous benchmarking of OpenBLAS performance
|
||||
|
||||
We run a set of benchmarks covering a subset of OpenBLAS functionality.
|
||||
|
||||
## Benchmark runner
|
||||
|
||||
[CodSpeed benchmark dashboard](https://codspeed.io/OpenMathLib/OpenBLAS/)
|
||||
|
||||
Click on [benchmarks](https://codspeed.io/OpenMathLib/OpenBLAS/benchmarks) to see the performance of a particular benchmark over time.
|
||||
Click on [branches](https://codspeed.io/OpenMathLib/OpenBLAS/branches/) and then on the last PR link to see the flamegraphs.
|
||||
|
||||
## What are the benchmarks
|
||||
|
||||
We run raw BLAS/LAPACK subroutines, via f2py-generated python wrappers. The wrappers themselves are equivalent to [those from SciPy](https://docs.scipy.org/doc/scipy/reference/linalg.lapack.html).
|
||||
In fact, the wrappers _are_ from SciPy, we take a small subset simply to avoid having to build the whole SciPy for each CI run.
|
||||
|
||||
|
||||
## Adding a new benchmark
|
||||
|
||||
`.github/workflows/codspeed-bench.yml` does all the orchestration on CI.
|
||||
|
||||
Benchmarks live in the `benchmark/pybench` directory. It is organized as follows:
|
||||
|
||||
- benchmarks themselves live in the `benchmarks` folder. Note that the LAPACK routines are imported from the `openblas_wrap` package.
|
||||
- the `openblas_wrap` package is a simple trampoline: it contains an f2py extension, `_flapack`, which talks to OpenBLAS, and exports the python names in its `__init__.py`.
|
||||
This way, the `openblas_wrap` package shields the benchmarks from the details of where a particular LAPACK function comes from. If desired, you may, for instance, replace the `_flapack` extension with
|
||||
`scipy.linalg.blas` and `scipy.linalg.lapack`.
|
||||
|
||||
To change parameters of an existing benchmark, edit python files in the `benchmark/pybench/benchmarks` directory.
|
||||
|
||||
To add a benchmark for a new BLAS or LAPACK function, you need to:
|
||||
|
||||
- add an f2py wrapper for the bare LAPACK function. You can simply copy a wrapper from SciPy (look for `*.pyf.src` files in https://github.com/scipy/scipy/tree/main/scipy/linalg)
|
||||
- add an import to `benchmark/pybench/openblas_wrap/__init__.py`
|
||||
|
||||
|
||||
## Running benchmarks locally
|
||||
|
||||
This benchmarking layer is orchestrated from python, therefore you'll need to
|
||||
have everything it takes to build OpenBLAS from source, plus `python` and the packages installed by the following command:
|
||||
|
||||
```
|
||||
$ python -mpip install numpy meson ninja pytest pytest-benchmark
|
||||
```
|
||||
|
||||
The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/test_blas.py`.
|
||||
|
||||
An ASV compatible benchmark suite is planned but currently not implemented.
|
||||
|
|
@ -1,274 +0,0 @@
|
|||
import pytest
|
||||
import numpy as np
|
||||
import openblas_wrap as ow
|
||||
|
||||
dtype_map = {
|
||||
's': np.float32,
|
||||
'd': np.float64,
|
||||
'c': np.complex64,
|
||||
'z': np.complex128,
|
||||
'dz': np.complex128,
|
||||
}
|
||||
|
||||
|
||||
# ### BLAS level 1 ###
|
||||
|
||||
# dnrm2
|
||||
|
||||
dnrm2_sizes = [100, 1000]
|
||||
|
||||
def run_dnrm2(n, x, incx, func):
|
||||
res = func(x, n, incx=incx)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['d', 'dz'])
|
||||
@pytest.mark.parametrize('n', dnrm2_sizes)
|
||||
def test_nrm2(benchmark, n, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
|
||||
x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
|
||||
nrm2 = ow.get_func('nrm2', variant)
|
||||
result = benchmark(run_dnrm2, n, x, 1, nrm2)
|
||||
|
||||
|
||||
# ddot
|
||||
|
||||
ddot_sizes = [100, 1000]
|
||||
|
||||
def run_ddot(x, y, func):
|
||||
res = func(x, y)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n', ddot_sizes)
|
||||
def test_dot(benchmark, n):
|
||||
rndm = np.random.RandomState(1234)
|
||||
|
||||
x = np.array(rndm.uniform(size=(n,)), dtype=float)
|
||||
y = np.array(rndm.uniform(size=(n,)), dtype=float)
|
||||
dot = ow.get_func('dot', 'd')
|
||||
result = benchmark(run_ddot, x, y, dot)
|
||||
|
||||
|
||||
# daxpy
|
||||
|
||||
daxpy_sizes = [100, 1000]
|
||||
|
||||
def run_daxpy(x, y, func):
|
||||
res = func(x, y, a=2.0)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
|
||||
@pytest.mark.parametrize('n', daxpy_sizes)
|
||||
def test_daxpy(benchmark, n, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
|
||||
x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
|
||||
y = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
|
||||
axpy = ow.get_func('axpy', variant)
|
||||
result = benchmark(run_daxpy, x, y, axpy)
|
||||
|
||||
|
||||
# ### BLAS level 2 ###
|
||||
|
||||
gemv_sizes = [100, 1000]
|
||||
|
||||
def run_gemv(a, x, y, func):
|
||||
res = func(1.0, a, x, y=y, overwrite_y=True)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
|
||||
@pytest.mark.parametrize('n', gemv_sizes)
|
||||
def test_dgemv(benchmark, n, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
|
||||
x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
|
||||
y = np.empty(n, dtype=dtyp)
|
||||
|
||||
a = np.array(rndm.uniform(size=(n,n)), dtype=dtyp)
|
||||
x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
|
||||
y = np.zeros(n, dtype=dtyp)
|
||||
|
||||
gemv = ow.get_func('gemv', variant)
|
||||
result = benchmark(run_gemv, a, x, y, gemv)
|
||||
|
||||
assert result is y
|
||||
|
||||
|
||||
# dgbmv
|
||||
|
||||
dgbmv_sizes = [100, 1000]
|
||||
|
||||
def run_gbmv(m, n, kl, ku, a, x, y, func):
|
||||
res = func(m, n, kl, ku, 1.0, a, x, y=y, overwrite_y=True)
|
||||
return res
|
||||
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
|
||||
@pytest.mark.parametrize('n', dgbmv_sizes)
|
||||
@pytest.mark.parametrize('kl', [1])
|
||||
def test_dgbmv(benchmark, n, kl, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
|
||||
x = np.array(rndm.uniform(size=(n,)), dtype=dtyp)
|
||||
y = np.empty(n, dtype=dtyp)
|
||||
|
||||
m = n
|
||||
|
||||
a = rndm.uniform(size=(2*kl + 1, n))
|
||||
a = np.array(a, dtype=dtyp, order='F')
|
||||
|
||||
gbmv = ow.get_func('gbmv', variant)
|
||||
result = benchmark(run_gbmv, m, n, kl, kl, a, x, y, gbmv)
|
||||
assert result is y
|
||||
|
||||
|
||||
# ### BLAS level 3 ###
|
||||
|
||||
# dgemm
|
||||
|
||||
gemm_sizes = [100, 1000]
|
||||
|
||||
def run_gemm(a, b, c, func):
|
||||
alpha = 1.0
|
||||
res = func(alpha, a, b, c=c, overwrite_c=True)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
|
||||
@pytest.mark.parametrize('n', gemm_sizes)
|
||||
def test_gemm(benchmark, n, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
a = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F')
|
||||
b = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F')
|
||||
c = np.empty((n, n), dtype=dtyp, order='F')
|
||||
gemm = ow.get_func('gemm', variant)
|
||||
result = benchmark(run_gemm, a, b, c, gemm)
|
||||
assert result is c
|
||||
|
||||
|
||||
# dsyrk
|
||||
|
||||
syrk_sizes = [100, 1000]
|
||||
|
||||
|
||||
def run_syrk(a, c, func):
|
||||
res = func(1.0, a, c=c, overwrite_c=True)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
|
||||
@pytest.mark.parametrize('n', syrk_sizes)
|
||||
def test_syrk(benchmark, n, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
a = np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F')
|
||||
c = np.empty((n, n), dtype=dtyp, order='F')
|
||||
syrk = ow.get_func('syrk', variant)
|
||||
result = benchmark(run_syrk, a, c, syrk)
|
||||
assert result is c
|
||||
|
||||
|
||||
# ### LAPACK ###
|
||||
|
||||
# linalg.solve
|
||||
|
||||
gesv_sizes = [100, 1000]
|
||||
|
||||
|
||||
def run_gesv(a, b, func):
|
||||
res = func(a, b, overwrite_a=True, overwrite_b=True)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd', 'c', 'z'])
|
||||
@pytest.mark.parametrize('n', gesv_sizes)
|
||||
def test_gesv(benchmark, n, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
|
||||
a = (np.array(rndm.uniform(size=(n, n)), dtype=dtyp, order='F') +
|
||||
np.eye(n, dtype=dtyp, order='F'))
|
||||
b = np.array(rndm.uniform(size=(n, 1)), dtype=dtyp, order='F')
|
||||
gesv = ow.get_func('gesv', variant)
|
||||
lu, piv, x, info = benchmark(run_gesv, a, b, gesv)
|
||||
assert lu is a
|
||||
assert x is b
|
||||
assert info == 0
|
||||
|
||||
|
||||
# linalg.svd
|
||||
|
||||
gesdd_sizes = [(100, 5), (1000, 222)]
|
||||
|
||||
|
||||
def run_gesdd(a, lwork, func):
|
||||
res = func(a, lwork=lwork, full_matrices=False, overwrite_a=False)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd'])
|
||||
@pytest.mark.parametrize('mn', gesdd_sizes)
|
||||
def test_gesdd(benchmark, mn, variant):
|
||||
m, n = mn
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
|
||||
a = np.array(rndm.uniform(size=(m, n)), dtype=dtyp, order='F')
|
||||
|
||||
gesdd_lwork = ow.get_func('gesdd_lwork', variant)
|
||||
|
||||
lwork, info = gesdd_lwork(m, n)
|
||||
lwork = int(lwork)
|
||||
assert info == 0
|
||||
|
||||
gesdd = ow.get_func('gesdd', variant)
|
||||
u, s, vt, info = benchmark(run_gesdd, a, lwork, gesdd)
|
||||
|
||||
assert info == 0
|
||||
|
||||
atol = {'s': 1e-5, 'd': 1e-13}
|
||||
np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=atol[variant])
|
||||
|
||||
|
||||
# linalg.eigh
|
||||
|
||||
syev_sizes = [50, 200]
|
||||
|
||||
|
||||
def run_syev(a, lwork, func):
|
||||
res = func(a, lwork=lwork, overwrite_a=True)
|
||||
return res
|
||||
|
||||
|
||||
@pytest.mark.parametrize('variant', ['s', 'd'])
|
||||
@pytest.mark.parametrize('n', syev_sizes)
|
||||
def test_syev(benchmark, n, variant):
|
||||
rndm = np.random.RandomState(1234)
|
||||
dtyp = dtype_map[variant]
|
||||
|
||||
a = rndm.uniform(size=(n, n))
|
||||
a = np.asarray(a + a.T, dtype=dtyp, order='F')
|
||||
a_ = a.copy()
|
||||
|
||||
dsyev_lwork = ow.get_func('syev_lwork', variant)
|
||||
lwork, info = dsyev_lwork(n)
|
||||
lwork = int(lwork)
|
||||
assert info == 0
|
||||
|
||||
syev = ow.get_func('syev', variant)
|
||||
w, v, info = benchmark(run_syev, a, lwork, syev)
|
||||
|
||||
assert info == 0
|
||||
assert a is v # overwrite_a=True
|
||||
|
||||
|
|
@ -1,48 +0,0 @@
|
|||
#
|
||||
# Taken from SciPy (of course)
|
||||
#
|
||||
project(
|
||||
'openblas-wrap',
|
||||
'c', 'fortran',
|
||||
version: '0.1',
|
||||
license: 'BSD-3',
|
||||
meson_version: '>= 1.1.0',
|
||||
default_options: [
|
||||
'buildtype=debugoptimized',
|
||||
'b_ndebug=if-release',
|
||||
'c_std=c17',
|
||||
'fortran_std=legacy',
|
||||
],
|
||||
)
|
||||
|
||||
py3 = import('python').find_installation(pure: false)
|
||||
py3_dep = py3.dependency()
|
||||
|
||||
cc = meson.get_compiler('c')
|
||||
|
||||
_global_c_args = cc.get_supported_arguments(
|
||||
'-Wno-unused-but-set-variable',
|
||||
'-Wno-unused-function',
|
||||
'-Wno-conversion',
|
||||
'-Wno-misleading-indentation',
|
||||
)
|
||||
add_project_arguments(_global_c_args, language : 'c')
|
||||
|
||||
# We need -lm for all C code (assuming it uses math functions, which is safe to
|
||||
# assume for SciPy). For C++ it isn't needed, because libstdc++/libc++ is
|
||||
# guaranteed to depend on it. For Fortran code, Meson already adds `-lm`.
|
||||
m_dep = cc.find_library('m', required : false)
|
||||
if m_dep.found()
|
||||
add_project_link_arguments('-lm', language : 'c')
|
||||
endif
|
||||
|
||||
generate_f2pymod = find_program('openblas_wrap/generate_f2pymod.py')
|
||||
|
||||
openblas = dependency('openblas', method: 'pkg-config', required: true)
|
||||
openblas_dep = declare_dependency(
|
||||
dependencies: openblas,
|
||||
compile_args: []
|
||||
)
|
||||
|
||||
|
||||
subdir('openblas_wrap')
|
|
@ -1,17 +0,0 @@
|
|||
"""
|
||||
Trampoline to hide the LAPACK details (scipy.lapack.linalg or scipy_openblas32 or...)
|
||||
from benchmarking.
|
||||
"""
|
||||
|
||||
__version__ = "0.1"
|
||||
|
||||
|
||||
from . import _flapack
|
||||
|
||||
PREFIX = ''
|
||||
|
||||
|
||||
def get_func(name, variant):
|
||||
"""get_func('gesv', 'c') -> cgesv etc."""
|
||||
return getattr(_flapack, PREFIX + variant + name)
|
||||
|
|
@ -1,417 +0,0 @@
|
|||
!
|
||||
! Taken from scipy/linalg
|
||||
!
|
||||
! Shorthand notations
|
||||
!
|
||||
! <tchar=s,d,cs,zd>
|
||||
! <tchar2c=cs,zd>
|
||||
!
|
||||
! <prefix2=s,d>
|
||||
! <prefix2c=c,z>
|
||||
! <prefix3=s,sc>
|
||||
! <prefix4=d,dz>
|
||||
! <prefix6=s,d,c,z,c,z>
|
||||
!
|
||||
! <ftype2=real,double precision>
|
||||
! <ftype2c=complex,double complex>
|
||||
! <ftype3=real,complex>
|
||||
! <ftype4=double precision,double complex>
|
||||
! <ftypereal3=real,real>
|
||||
! <ftypereal4=double precision,double precision>
|
||||
! <ftype6=real,double precision,complex,double complex,\2,\3>
|
||||
! <ftype6creal=real,double precision,complex,double complex,\0,\1>
|
||||
!
|
||||
! <ctype2=float,double>
|
||||
! <ctype2c=complex_float,complex_double>
|
||||
! <ctype3=float,complex_float>
|
||||
! <ctype4=double,complex_double>
|
||||
! <ctypereal3=float,float>
|
||||
! <ctypereal4=double,double>
|
||||
! <ctype6=float,double,complex_float,complex_double,\2,\3>
|
||||
! <ctype6creal=float,double,complex_float,complex_double,\0,\1>
|
||||
!
|
||||
!
|
||||
! Level 1 BLAS
|
||||
!
|
||||
|
||||
|
||||
python module _flapack
|
||||
usercode '''
|
||||
#define F_INT int
|
||||
'''
|
||||
|
||||
interface
|
||||
|
||||
|
||||
subroutine <prefix>axpy(n,a,x,offx,incx,y,offy,incy)
|
||||
! Calculate z = a*x+y, where a is scalar.
|
||||
|
||||
callstatement (*f2py_func)(&n,&a,x+offx,&incx,y+offy,&incy)
|
||||
callprotoargument F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*,F_INT*
|
||||
|
||||
<ftype> dimension(*), intent(in) :: x
|
||||
<ftype> dimension(*), intent(in,out,out=z) :: y
|
||||
<ftype> optional, intent(in):: a=<1.0,\0,(1.0\,0.0),\2>
|
||||
integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
|
||||
integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
|
||||
integer optional, intent(in),depend(x) :: offx=0
|
||||
integer optional, intent(in),depend(y) :: offy=0
|
||||
check(offx>=0 && offx<len(x)) :: offx
|
||||
check(offy>=0 && offy<len(y)) :: offy
|
||||
integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
|
||||
n = (len(x)-offx)/abs(incx)
|
||||
check(len(x)-offx>(n-1)*abs(incx)) :: n
|
||||
check(len(y)-offy>(n-1)*abs(incy)) :: n
|
||||
|
||||
end subroutine <prefix>axpy
|
||||
|
||||
function ddot(n,x,offx,incx,y,offy,incy) result (xy)
|
||||
! Computes a vector-vector dot product.
|
||||
|
||||
callstatement ddot_return_value = (*f2py_func)(&n,x+offx,&incx,y+offy,&incy)
|
||||
callprotoargument F_INT*,double*,F_INT*,double*,F_INT*
|
||||
intent(c) ddot
|
||||
fortranname F_FUNC(ddot,DDOT)
|
||||
|
||||
double precision dimension(*), intent(in) :: x
|
||||
double precision dimension(*), intent(in) :: y
|
||||
double precision ddot,xy
|
||||
integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
|
||||
integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
|
||||
integer optional, intent(in),depend(x) :: offx=0
|
||||
integer optional, intent(in),depend(y) :: offy=0
|
||||
check(offx>=0 && offx<len(x)) :: offx
|
||||
check(offy>=0 && offy<len(y)) :: offy
|
||||
integer optional, intent(in),depend(x,incx,offx,y,incy,offy) :: &
|
||||
n = (len(x)-offx)/abs(incx)
|
||||
check(len(x)-offx>(n-1)*abs(incx)) :: n
|
||||
check(len(y)-offy>(n-1)*abs(incy)) :: n
|
||||
|
||||
end function ddot
|
||||
|
||||
|
||||
function <prefix4>nrm2(n,x,offx,incx) result(n2)
|
||||
|
||||
<ftypereal4> <prefix4>nrm2, n2
|
||||
|
||||
callstatement <prefix4>nrm2_return_value = (*f2py_func)(&n,x+offx,&incx)
|
||||
callprotoargument F_INT*,<ctype4>*,F_INT*
|
||||
intent(c) <prefix4>nrm2
|
||||
fortranname F_FUNC(<prefix4>nrm2,<D,DZ>NRM2)
|
||||
|
||||
<ftype4> dimension(*),intent(in) :: x
|
||||
|
||||
integer optional, intent(in),check(incx>0) :: incx = 1
|
||||
|
||||
integer optional,intent(in),depend(x) :: offx=0
|
||||
check(offx>=0 && offx<len(x)) :: offx
|
||||
|
||||
integer optional,intent(in),depend(x,incx,offx) :: n = (len(x)-offx)/abs(incx)
|
||||
check(len(x)-offx>(n-1)*abs(incx)) :: n
|
||||
|
||||
end function <prefix4>nrm2
|
||||
|
||||
|
||||
!
|
||||
! Level 2 BLAS
|
||||
!
|
||||
|
||||
|
||||
subroutine <prefix>gemv(m,n,alpha,a,x,beta,y,offx,incx,offy,incy,trans,rows,cols,ly)
|
||||
! Computes a matrix-vector product using a general matrix
|
||||
!
|
||||
! y = gemv(alpha,a,x,beta=0,y=0,offx=0,incx=1,offy=0,incy=1,trans=0)
|
||||
! Calculate y <- alpha * op(A) * x + beta * y
|
||||
|
||||
callstatement (*f2py_func)((trans?(trans==2?"C":"T"):"N"),&m,&n,&alpha,a,&m, &
|
||||
x+offx,&incx,&beta,y+offy,&incy)
|
||||
callprotoargument char*,F_INT*,F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*,F_INT*,<ctype>*, &
|
||||
<ctype>*,F_INT*
|
||||
|
||||
integer optional, intent(in), check(trans>=0 && trans <=2) :: trans = 0
|
||||
integer optional, intent(in), check(incx>0||incx<0) :: incx = 1
|
||||
integer optional, intent(in), check(incy>0||incy<0) :: incy = 1
|
||||
<ftype> intent(in) :: alpha
|
||||
<ftype> intent(in), optional :: beta = <0.0,\0,(0.0\,0.0),\2>
|
||||
|
||||
<ftype> dimension(*), intent(in) :: x
|
||||
<ftype> dimension(ly), intent(in,copy,out), depend(ly),optional :: y
|
||||
integer intent(hide), depend(incy,rows,offy) :: ly = &
|
||||
(y_capi==Py_None?1+offy+(rows-1)*abs(incy):-1)
|
||||
<ftype> dimension(m,n), intent(in) :: a
|
||||
integer depend(a), intent(hide):: m = shape(a,0)
|
||||
integer depend(a), intent(hide):: n = shape(a,1)
|
||||
|
||||
integer optional, intent(in) :: offx=0
|
||||
integer optional, intent(in) :: offy=0
|
||||
check(offx>=0 && offx<len(x)) :: x
|
||||
check(len(x)>offx+(cols-1)*abs(incx)) :: x
|
||||
depend(offx,cols,incx) :: x
|
||||
|
||||
check(offy>=0 && offy<len(y)) :: y
|
||||
check(len(y)>offy+(rows-1)*abs(incy)) :: y
|
||||
depend(offy,rows,incy) :: y
|
||||
|
||||
integer depend(m,n,trans), intent(hide) :: rows = (trans?n:m)
|
||||
integer depend(m,n,trans), intent(hide) :: cols = (trans?m:n)
|
||||
|
||||
end subroutine <prefix>gemv
|
||||
|
||||
|
||||
subroutine <prefix>gbmv(m,n,kl,ku,alpha,a,lda,x,incx,offx,beta,y,incy,offy,trans,ly)
|
||||
! Performs one of the matrix-vector operations
|
||||
!
|
||||
! y := alpha*A*x + beta*y, or y := alpha*A**T*x + beta*y,
|
||||
! or y := alpha*A**H*x + beta*y,
|
||||
!
|
||||
! where alpha and beta are scalars, x and y are vectors and A is an
|
||||
! m by n band matrix, with kl sub-diagonals and ku super-diagonals.
|
||||
|
||||
callstatement (*f2py_func)((trans?(trans==2?"C":"T"):"N"),&m,&n,&kl,&ku,&alpha,a,&lda,x+offx,&incx,&beta,y+offy,&incy)
|
||||
callprotoargument char*,F_INT*,F_INT*,F_INT*,F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*,F_INT*,<ctype>*,<ctype>*,F_INT*
|
||||
|
||||
integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0
|
||||
integer intent(in), depend(ku,kl),check(m>=ku+kl+1) :: m
|
||||
integer intent(in),check(n>=0&&n==shape(a,1)),depend(a) :: n
|
||||
integer intent(in),check(kl>=0) :: kl
|
||||
integer intent(in),check(ku>=0) :: ku
|
||||
integer intent(hide),depend(a) :: lda = MAX(shape(a,0),1)
|
||||
integer optional, intent(in),check(incx>0||incx<0) :: incx = 1
|
||||
integer optional, intent(in),check(incy>0||incy<0) :: incy = 1
|
||||
integer intent(hide),depend(m,n,incy,offy,trans) :: ly = &
|
||||
(y_capi==Py_None?1+offy+(trans==0?m-1:n-1)*abs(incy):-1)
|
||||
integer optional, intent(in) :: offx=0
|
||||
integer optional, intent(in) :: offy=0
|
||||
|
||||
<ftype> intent(in) :: alpha
|
||||
<ftype> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2>
|
||||
|
||||
<ftype> dimension(lda,n),intent(in) :: a
|
||||
|
||||
<ftype> dimension(ly), intent(in,out,copy,out=yout),depend(ly),optional :: y
|
||||
check(offy>=0 && offy<len(y)) :: y
|
||||
check(len(y)>offy+(trans==0?m-1:n-1)*abs(incy)) :: y
|
||||
depend(offy,n,incy) :: y
|
||||
|
||||
<ftype> dimension(*), intent(in) :: x
|
||||
check(offx>=0 && offx<len(x)) :: x
|
||||
check(len(x)>offx+(trans==0?n-1:m-1)*abs(incx)) :: x
|
||||
depend(offx,n,incx) :: x
|
||||
|
||||
end subroutine <prefix>gbmv
|
||||
|
||||
|
||||
|
||||
!
|
||||
! Level 3 BLAS
|
||||
!
|
||||
|
||||
|
||||
subroutine <prefix>gemm(m,n,k,alpha,a,b,beta,c,trans_a,trans_b,lda,ka,ldb,kb)
|
||||
! Computes a scalar-matrix-matrix product and adds the result to a
|
||||
! scalar-matrix product.
|
||||
!
|
||||
! c = gemm(alpha,a,b,beta=0,c=0,trans_a=0,trans_b=0,overwrite_c=0)
|
||||
! Calculate C <- alpha * op(A) * op(B) + beta * C
|
||||
|
||||
callstatement (*f2py_func)((trans_a?(trans_a==2?"C":"T"):"N"), &
|
||||
(trans_b?(trans_b==2?"C":"T"):"N"),&m,&n,&k,&alpha,a,&lda,b,&ldb,&beta,c,&m)
|
||||
callprotoargument char*,char*,F_INT*,F_INT*,F_INT*,<ctype>*,<ctype>*,F_INT*,<ctype>*, &
|
||||
F_INT*,<ctype>*,<ctype>*,F_INT*
|
||||
|
||||
integer optional,intent(in),check(trans_a>=0 && trans_a <=2) :: trans_a = 0
|
||||
integer optional,intent(in),check(trans_b>=0 && trans_b <=2) :: trans_b = 0
|
||||
<ftype> intent(in) :: alpha
|
||||
<ftype> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2>
|
||||
|
||||
<ftype> dimension(lda,ka),intent(in) :: a
|
||||
<ftype> dimension(ldb,kb),intent(in) :: b
|
||||
<ftype> dimension(m,n),intent(in,out,copy),depend(m,n),optional :: c
|
||||
check(shape(c,0)==m && shape(c,1)==n) :: c
|
||||
|
||||
integer depend(a),intent(hide) :: lda = shape(a,0)
|
||||
integer depend(a),intent(hide) :: ka = shape(a,1)
|
||||
integer depend(b),intent(hide) :: ldb = shape(b,0)
|
||||
integer depend(b),intent(hide) :: kb = shape(b,1)
|
||||
|
||||
integer depend(a,trans_a,ka,lda),intent(hide):: m = (trans_a?ka:lda)
|
||||
integer depend(a,trans_a,ka,lda),intent(hide):: k = (trans_a?lda:ka)
|
||||
integer depend(b,trans_b,kb,ldb,k),intent(hide),check(trans_b?kb==k:ldb==k) :: &
|
||||
n = (trans_b?ldb:kb)
|
||||
|
||||
end subroutine <prefix>gemm
|
||||
|
||||
|
||||
subroutine <prefix6><sy,\0,\0,\0,he,he>rk(n,k,alpha,a,beta,c,trans,lower,lda,ka)
|
||||
! performs one of the symmetric rank k operations
|
||||
! C := alpha*A*A**T + beta*C, or C := alpha*A**T*A + beta*C,
|
||||
!
|
||||
! c = syrk(alpha,a,beta=0,c=0,trans=0,lower=0,overwrite_c=0)
|
||||
!
|
||||
callstatement (*f2py_func)((lower?"L":"U"), &
|
||||
(trans?(trans==2?"C":"T"):"N"), &n,&k,&alpha,a,&lda,&beta,c,&n)
|
||||
callprotoargument char*,char*,F_INT*,F_INT*,<ctype6>*,<ctype6>*,F_INT*,<ctype6>*, &
|
||||
<ctype6>*,F_INT*
|
||||
|
||||
integer optional, intent(in),check(lower==0||lower==1) :: lower = 0
|
||||
integer optional,intent(in),check(trans>=0 && trans <=2) :: trans = 0
|
||||
|
||||
<ftype6> intent(in) :: alpha
|
||||
<ftype6> intent(in),optional :: beta = <0.0,\0,(0.0\,0.0),\2,\2,\2>
|
||||
|
||||
<ftype6> dimension(lda,ka),intent(in) :: a
|
||||
<ftype6> dimension(n,n),intent(in,out,copy),depend(n),optional :: c
|
||||
check(shape(c,0)==n && shape(c,1)==n) :: c
|
||||
|
||||
integer depend(a),intent(hide) :: lda = shape(a,0)
|
||||
integer depend(a),intent(hide) :: ka = shape(a,1)
|
||||
|
||||
integer depend(a, trans, ka, lda), intent(hide) :: n = (trans ? ka : lda)
|
||||
integer depend(a, trans, ka, lda), intent(hide) :: k = (trans ? lda : ka)
|
||||
|
||||
end subroutine <prefix6><sy,\0,\0,\0,he,he>rk
|
||||
|
||||
|
||||
!
|
||||
! LAPACK
|
||||
!
|
||||
|
||||
subroutine <prefix>gesv(n,nrhs,a,piv,b,info)
|
||||
! lu,piv,x,info = gesv(a,b,overwrite_a=0,overwrite_b=0)
|
||||
! Solve A * X = B.
|
||||
! A = P * L * U
|
||||
! U is upper diagonal triangular, L is unit lower triangular,
|
||||
! piv pivots columns.
|
||||
|
||||
callstatement {F_INT i;(*f2py_func)(&n,&nrhs,a,&n,piv,b,&n,&info);for(i=0;i\<n;--piv[i++]);}
|
||||
callprotoargument F_INT*,F_INT*,<ctype>*,F_INT*,F_INT*,<ctype>*,F_INT*,F_INT*
|
||||
|
||||
integer depend(a),intent(hide):: n = shape(a,0)
|
||||
integer depend(b),intent(hide):: nrhs = shape(b,1)
|
||||
<ftype> dimension(n,n),check(shape(a,0)==shape(a,1)) :: a
|
||||
integer dimension(n),depend(n),intent(out) :: piv
|
||||
<ftype> dimension(n,nrhs),check(shape(a,0)==shape(b,0)),depend(n) :: b
|
||||
integer intent(out)::info
|
||||
intent(in,out,copy,out=x) b
|
||||
intent(in,out,copy,out=lu) a
|
||||
end subroutine <prefix>gesv
|
||||
|
||||
|
||||
subroutine <prefix2>gesdd(m,n,minmn,u0,u1,vt0,vt1,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info)
|
||||
! u,s,vt,info = gesdd(a,compute_uv=1,lwork=..,overwrite_a=0)
|
||||
! Compute the singular value decomposition (SVD) using divide and conquer:
|
||||
! A = U * SIGMA * transpose(V)
|
||||
! A - M x N matrix
|
||||
! U - M x M matrix or M x min(M,N) if full_matrices=False
|
||||
! SIGMA - M x N zero matrix with a main diagonal filled with min(M,N)
|
||||
! singular values
|
||||
! transpose(V) - N x N matrix or min(M,N) x N if full_matrices=False
|
||||
|
||||
callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,a,&m,s,u,&u0,vt,&vt0,work,&lwork,iwork,&info)
|
||||
callprotoargument char*,F_INT*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,F_INT*,F_INT*
|
||||
|
||||
integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1
|
||||
integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1
|
||||
integer intent(hide),depend(a):: m = shape(a,0)
|
||||
integer intent(hide),depend(a):: n = shape(a,1)
|
||||
integer intent(hide),depend(m,n):: minmn = MIN(m,n)
|
||||
integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1)
|
||||
integer intent(hide),depend(compute_uv,minmn, full_matrices) :: u1 = (compute_uv?(full_matrices?m:minmn):1)
|
||||
integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1)
|
||||
integer intent(hide),depend(compute_uv,minmn) :: vt1 = (compute_uv?n:1)
|
||||
<ftype2> dimension(m,n),intent(in,copy,aligned8) :: a
|
||||
<ftype2> dimension(minmn),intent(out),depend(minmn) :: s
|
||||
<ftype2> dimension(u0,u1),intent(out),depend(u0, u1) :: u
|
||||
<ftype2> dimension(vt0,vt1),intent(out),depend(vt0, vt1) :: vt
|
||||
<ftype2> dimension(lwork),intent(hide,cache),depend(lwork) :: work
|
||||
integer optional,intent(in),depend(minmn,compute_uv) &
|
||||
:: lwork = max((compute_uv?4*minmn*minmn+MAX(m,n)+9*minmn:MAX(14*minmn+4,10*minmn+2+25*(25+8))+MAX(m,n)),1)
|
||||
integer intent(hide,cache),dimension(8*minmn),depend(minmn) :: iwork
|
||||
integer intent(out)::info
|
||||
|
||||
end subroutine <prefix2>gesdd
|
||||
|
||||
subroutine <prefix2>gesdd_lwork(m,n,minmn,u0,vt0,a,compute_uv,full_matrices,u,s,vt,work,lwork,iwork,info)
|
||||
! LWORK computation for (S/D)GESDD
|
||||
|
||||
fortranname <prefix2>gesdd
|
||||
callstatement (*f2py_func)((compute_uv?(full_matrices?"A":"S"):"N"),&m,&n,&a,&m,&s,&u,&u0,&vt,&vt0,&work,&lwork,&iwork,&info)
|
||||
callprotoargument char*,F_INT*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,F_INT*,F_INT*,F_INT*
|
||||
|
||||
integer intent(in),optional,check(compute_uv==0||compute_uv==1):: compute_uv = 1
|
||||
integer intent(in),optional,check(full_matrices==0||full_matrices==1):: full_matrices = 1
|
||||
integer intent(in) :: m
|
||||
integer intent(in) :: n
|
||||
integer intent(hide),depend(m,n):: minmn = MIN(m,n)
|
||||
integer intent(hide),depend(compute_uv,minmn) :: u0 = (compute_uv?m:1)
|
||||
integer intent(hide),depend(compute_uv,minmn, full_matrices) :: vt0 = (compute_uv?(full_matrices?n:minmn):1)
|
||||
<ftype2> intent(hide) :: a
|
||||
<ftype2> intent(hide) :: s
|
||||
<ftype2> intent(hide) :: u
|
||||
<ftype2> intent(hide) :: vt
|
||||
<ftype2> intent(out) :: work
|
||||
integer intent(hide) :: lwork = -1
|
||||
integer intent(hide) :: iwork
|
||||
integer intent(out) :: info
|
||||
|
||||
end subroutine <prefix2>gesdd_lwork
|
||||
|
||||
|
||||
subroutine <prefix2>syev(compute_v,lower,n,w,a,lda,work,lwork,info)
|
||||
! w,v,info = syev(a,compute_v=1,lower=0,lwork=3*n-1,overwrite_a=0)
|
||||
! Compute all eigenvalues and, optionally, eigenvectors of a
|
||||
! real symmetric matrix A.
|
||||
!
|
||||
! Performance tip:
|
||||
! If compute_v=0 then set also overwrite_a=1.
|
||||
|
||||
callstatement (*f2py_func)((compute_v?"V":"N"),(lower?"L":"U"),&n,a,&lda,w,work,&lwork,&info)
|
||||
callprotoargument char*,char*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,F_INT*
|
||||
|
||||
integer optional,intent(in):: compute_v = 1
|
||||
check(compute_v==1||compute_v==0) compute_v
|
||||
integer optional,intent(in),check(lower==0||lower==1) :: lower = 0
|
||||
|
||||
integer intent(hide),depend(a):: n = shape(a,0)
|
||||
integer intent(hide),depend(a):: lda = MAX(1,shape(a,0))
|
||||
<ftype2> dimension(n,n),check(shape(a,0)==shape(a,1)) :: a
|
||||
intent(in,copy,out,out=v) :: a
|
||||
|
||||
<ftype2> dimension(n),intent(out),depend(n) :: w
|
||||
|
||||
integer optional,intent(in),depend(n) :: lwork=max(3*n-1,1)
|
||||
check(lwork>=3*n-1) :: lwork
|
||||
<ftype2> dimension(lwork),intent(hide),depend(lwork) :: work
|
||||
|
||||
integer intent(out) :: info
|
||||
|
||||
end subroutine <prefix2>syev
|
||||
|
||||
|
||||
subroutine <prefix2>syev_lwork(lower,n,w,a,lda,work,lwork,info)
|
||||
! LWORK routines for syev
|
||||
|
||||
fortranname <prefix2>syev
|
||||
|
||||
callstatement (*f2py_func)("N",(lower?"L":"U"),&n,&a,&lda,&w,&work,&lwork,&info)
|
||||
callprotoargument char*,char*,F_INT*,<ctype2>*,F_INT*,<ctype2>*,<ctype2>*,F_INT*,F_INT*
|
||||
|
||||
integer intent(in):: n
|
||||
integer optional,intent(in),check(lower==0||lower==1) :: lower = 0
|
||||
|
||||
integer intent(hide),depend(n):: lda = MAX(1, n)
|
||||
<ftype2> intent(hide):: a
|
||||
<ftype2> intent(hide):: w
|
||||
integer intent(hide):: lwork = -1
|
||||
|
||||
<ftype2> intent(out):: work
|
||||
integer intent(out):: info
|
||||
|
||||
end subroutine <prefix2>syev_lwork
|
||||
|
||||
end interface
|
||||
|
||||
end python module _flapack
|
||||
|
||||
|
||||
|
|
@ -1,299 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Process f2py template files (`filename.pyf.src` -> `filename.pyf`)
|
||||
|
||||
Usage: python generate_f2pymod.py filename.pyf.src -o outdir
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import subprocess
|
||||
import argparse
|
||||
|
||||
|
||||
# START OF CODE VENDORED FROM `numpy.distutils.from_template`
|
||||
#############################################################
|
||||
"""
|
||||
process_file(filename)
|
||||
|
||||
takes templated file .xxx.src and produces .xxx file where .xxx
|
||||
is .pyf .f90 or .f using the following template rules:
|
||||
|
||||
'<..>' denotes a template.
|
||||
|
||||
All function and subroutine blocks in a source file with names that
|
||||
contain '<..>' will be replicated according to the rules in '<..>'.
|
||||
|
||||
The number of comma-separated words in '<..>' will determine the number of
|
||||
replicates.
|
||||
|
||||
'<..>' may have two different forms, named and short. For example,
|
||||
|
||||
named:
|
||||
<p=d,s,z,c> where anywhere inside a block '<p>' will be replaced with
|
||||
'd', 's', 'z', and 'c' for each replicate of the block.
|
||||
|
||||
<_c> is already defined: <_c=s,d,c,z>
|
||||
<_t> is already defined: <_t=real,double precision,complex,double complex>
|
||||
|
||||
short:
|
||||
<s,d,c,z>, a short form of the named, useful when no <p> appears inside
|
||||
a block.
|
||||
|
||||
In general, '<..>' contains a comma separated list of arbitrary
|
||||
expressions. If these expression must contain a comma|leftarrow|rightarrow,
|
||||
then prepend the comma|leftarrow|rightarrow with a backslash.
|
||||
|
||||
If an expression matches '\\<index>' then it will be replaced
|
||||
by <index>-th expression.
|
||||
|
||||
Note that all '<..>' forms in a block must have the same number of
|
||||
comma-separated entries.
|
||||
|
||||
Predefined named template rules:
|
||||
<prefix=s,d,c,z>
|
||||
<ftype=real,double precision,complex,double complex>
|
||||
<ftypereal=real,double precision,\\0,\\1>
|
||||
<ctype=float,double,complex_float,complex_double>
|
||||
<ctypereal=float,double,\\0,\\1>
|
||||
"""
|
||||
|
||||
# Fixed-form Fortran places a continuation marker ('$' or '*') in column 6,
# i.e. after exactly five blanks.  The two patterns that mention the marker,
# and the 7-character slice test in parse_structure, all rely on that layout.
routine_start_re = re.compile(
    r'(\n|\A)((     (\$|\*))|)\s*(subroutine|function)\b',
    re.I
)
routine_end_re = re.compile(r'\n\s*end\s*(subroutine|function)\b.*(\n|\Z)', re.I)
function_start_re = re.compile(r'\n     (\$|\*)\s*function\b', re.I)

def parse_structure(astr):
    """Locate every subroutine/function block in ``astr``.

    Parameters
    ----------
    astr : str
        Full text of a template source file.

    Returns
    -------
    list of (int, int)
        One ``(start, end)`` pair of string offsets per routine, in file
        order.  ``start`` is the offset just past the newline that precedes
        the routine header; ``end`` is the offset of the last character
        matched by the routine's ``end subroutine``/``end function`` line,
        or ``len(astr)`` when no such line follows.
    """
    spanlist = []
    ind = 0
    while True:
        m = routine_start_re.search(astr, ind)
        if m is None:
            break
        start = m.start()
        if function_start_re.match(astr, start, m.end()):
            # The 'function' keyword sits on a continuation line, so the
            # header began earlier.  Walk back one line, and keep walking
            # while the line reached is itself a '$' continuation line.
            while True:
                i = astr.rfind('\n', ind, start)
                if i == -1:
                    break
                start = i
                # newline + five blanks + '$' is exactly seven characters
                if astr[i:i+7] != '\n     $':
                    break
        # step over the newline so the span begins on the header text
        start += 1
        m = routine_end_re.search(astr, m.end())
        ind = end = (m.end() - 1) if m else len(astr)
        spanlist.append((start, end))
    return spanlist
|
||||
|
||||
template_re = re.compile(r"<\s*(\w[\w\d]*)\s*>")
|
||||
named_re = re.compile(r"<\s*(\w[\w\d]*)\s*=\s*(.*?)\s*>")
|
||||
list_re = re.compile(r"<\s*((.*?))\s*>")
|
||||
|
||||
def find_repl_patterns(astr):
    """Collect every named rule of the form ``<name=a,b,...>`` in ``astr``.

    Returns a dict mapping each rule name to its replacement list, kept as
    the comma-joined string produced by ``conv``.  Escaped commas are
    swapped for the ``@comma@`` placeholder before ``conv`` splits the list.
    When a name is defined more than once, the last definition wins.
    """
    found = {}
    for raw_name, raw_list in named_re.findall(astr):
        key = raw_name.strip() or unique_key(found)
        found[key] = conv(raw_list.replace(r'\,', '@comma@'))
    return found
|
||||
|
||||
def find_and_remove_repl_patterns(astr):
    """Split ``astr`` into its text and its named rule definitions.

    Returns ``(text, rules)``: ``text`` is ``astr`` with every
    ``<name=...>`` definition deleted, and ``rules`` is the dict built by
    ``find_repl_patterns`` from the unmodified input.
    """
    definitions = find_repl_patterns(astr)
    stripped = named_re.sub('', astr)
    return stripped, definitions
|
||||
|
||||
item_re = re.compile(r"\A\\(?P<index>\d+)\Z")

def conv(astr):
    """Normalise a comma-separated replacement list.

    Each entry is stripped of surrounding whitespace.  An entry consisting
    solely of a backslash followed by digits is replaced by the entry
    currently stored at that index, so earlier substitutions are visible to
    later ones.  The entries are returned re-joined with commas.
    """
    entries = [piece.strip() for piece in astr.split(',')]
    for pos, entry in enumerate(entries):
        backref = item_re.match(entry)
        if backref is not None:
            entries[pos] = entries[int(backref.group('index'))]
    return ','.join(entries)
|
||||
|
||||
def unique_key(adict):
    """Return the first key ``__l1``, ``__l2``, ... that is not in ``adict``."""
    taken = list(adict.keys())
    counter = 1
    candidate = '__l%s' % (counter)
    while candidate in taken:
        counter += 1
        candidate = '__l%s' % (counter)
    return candidate
|
||||
|
||||
|
||||
template_name_re = re.compile(r'\A\s*(\w[\w\d]*)\s*\Z')

def expand_sub(substr, names):
    """Replicate one routine block once per entry of its template rules.

    ``substr`` is the text of a single routine and ``names`` maps rule names
    known so far to comma-joined replacement lists.  Rules defined inside
    the block take precedence over ``names``.  ``names`` is updated in place
    with rules first seen here whose replacement list does not start with
    an underscore.

    Returns the concatenated replicas, each followed by a blank line, or the
    block with its definitions reduced to plain ``<name>`` references when
    no rule applies.  Raises ``ValueError`` for a ``<name>`` with no rule.
    """
    # Protect escaped angle brackets so they are not parsed as templates.
    text = substr.replace(r'\>', '@rightarrow@').replace(r'\<', '@leftarrow@')
    local_rules = find_repl_patterns(text)
    # Reduce "<name=...>" definitions to bare "<name>" references.
    text = named_re.sub(r"<\1>", text)

    def _name_inline_list(match):
        body = conv(match.group(1).replace(r'\,', '@comma@'))
        if template_name_re.match(body):
            # Already a plain name reference; leave it alone.
            return "<%s>" % (body)
        # Reuse the most recently stored rule with an identical list, if any.
        same = [key for key, value in local_rules.items() if value == body]
        if same:
            label = same[-1]
        else:
            label = unique_key(local_rules)
            local_rules[label] = body
        return "<%s>" % label

    # convert all lists to named templates
    # newnames are constructed as needed
    text = list_re.sub(_name_inline_list, text)

    count = None
    first_rule = None
    active = {}
    for tag in template_re.findall(text):
        if tag in active:
            continue
        spec = local_rules.get(tag, names.get(tag, None))
        if spec is None:
            raise ValueError('No replicates found for <%s>' % (tag))
        if tag not in names and not spec.startswith('_'):
            names[tag] = spec
        values = [item.replace('@comma@', ',') for item in spec.split(',')]

        if count is None:
            # The first rule encountered fixes the number of replicas.
            count = len(values)
            first_rule = tag
            active[tag] = values
        elif len(values) == count:
            active[tag] = values
        else:
            print("Mismatch in number of replacements (base <{}={}>) "
                  "for <{}={}>. Ignoring."
                  .format(first_rule, ','.join(active[first_rule]), tag, spec))
    if not active:
        return text

    copies = []
    for k in range(count):
        def _pick(match, _k=k):
            tag = match.group(1)
            # Names without an accepted rule are emitted as the bare name.
            return active[tag][_k] if tag in active else tag
        copies.append(template_re.sub(_pick, text) + '\n\n')

    expanded = ''.join(copies)
    return expanded.replace('@rightarrow@', '>').replace('@leftarrow@', '<')
|
||||
|
||||
def process_str(allstr):
    """Expand every templated routine in ``allstr`` and return the new text.

    Text between routines is copied through with its ``<name=...>``
    definitions removed; those definitions are added to the rule set, which
    starts from ``_special_names``.  Each routine span reported by
    ``parse_structure`` is replaced by the output of ``expand_sub``.
    """
    rules = dict(_special_names)
    chunks = []
    cursor = 0
    for begin, end in parse_structure(allstr):
        between, new_rules = find_and_remove_repl_patterns(allstr[cursor:begin])
        chunks.append(between)
        rules.update(new_rules)
        chunks.append(expand_sub(allstr[begin:end], rules))
        cursor = end
    # Whatever follows the last routine is copied unchanged.
    chunks.append(allstr[cursor:])
    return ''.join(chunks)
|
||||
|
||||
include_src_re = re.compile(
    r"(\n|\A)\s*include\s*['\"](?P<name>[\w\d./\\]+\.src)['\"]",
    re.I
)

def resolve_includes(source):
    """Read ``source`` and splice in the files named by its include lines.

    A line matching ``include '<path>.src'`` is replaced by the recursively
    resolved lines of that file when the file exists; relative paths are
    taken relative to the directory of ``source``.  Every other line,
    including an include whose target is missing, is kept as read.

    Returns the resulting list of lines.
    """
    base_dir = os.path.dirname(source)
    collected = []
    with open(source) as handle:
        for text_line in handle:
            hit = include_src_re.match(text_line)
            target = None
            if hit:
                target = hit.group('name')
                if not os.path.isabs(target):
                    target = os.path.join(base_dir, target)
            if target is not None and os.path.isfile(target):
                collected.extend(resolve_includes(target))
            else:
                collected.append(text_line)
    return collected
|
||||
|
||||
def process_file(source):
    """Return the expanded text of the template file ``source``.

    Include lines are resolved first, then the joined text is passed
    through ``process_str``.
    """
    return process_str(''.join(resolve_includes(source)))
|
||||
|
||||
_special_names = find_repl_patterns('''
|
||||
<_c=s,d,c,z>
|
||||
<_t=real,double precision,complex,double complex>
|
||||
<prefix=s,d,c,z>
|
||||
<ftype=real,double precision,complex,double complex>
|
||||
<ctype=float,double,complex_float,complex_double>
|
||||
<ftypereal=real,double precision,\\0,\\1>
|
||||
<ctypereal=float,double,\\0,\\1>
|
||||
''')
|
||||
|
||||
# END OF CODE VENDORED FROM `numpy.distutils.from_template`
|
||||
###########################################################
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Expands a ``.pyf.src`` or ``.f.src`` template into the output directory
    and, for ``.pyf.src`` and ``.pyf`` inputs, runs ``numpy.f2py`` on the
    signature file with ``--build-dir`` set to that directory.

    Raises
    ------
    ValueError
        If the input file does not end in ``.pyf``, ``.pyf.src`` or
        ``.f.src``.
    RuntimeError
        If the f2py subprocess exits with a non-zero status; the message
        carries the captured stdout and stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", type=str,
                        help="Path to the input file")
    # The output directory is joined into paths below, so it must be given.
    parser.add_argument("-o", "--outdir", type=str, required=True,
                        help="Path to the output directory")
    args = parser.parse_args()

    if not args.infile.endswith(('.pyf', '.pyf.src', '.f.src')):
        raise ValueError(f"Input file has unknown extension: {args.infile}")

    outdir_abs = os.path.join(os.getcwd(), args.outdir)

    # Write out the .pyf/.f file
    if args.infile.endswith(('.pyf.src', '.f.src')):
        code = process_file(args.infile)
        # Output name is the input basename with the trailing ".src" removed.
        fname_pyf = os.path.join(args.outdir,
                                 os.path.splitext(os.path.split(args.infile)[1])[0])

        with open(fname_pyf, 'w') as f:
            f.write(code)
    else:
        fname_pyf = args.infile

    # Now invoke f2py to generate the C API module file
    if args.infile.endswith(('.pyf.src', '.pyf')):
        p = subprocess.Popen([sys.executable, '-m', 'numpy.f2py', fname_pyf,
                              '--build-dir', outdir_abs],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             cwd=os.getcwd())
        out, err = p.communicate()
        if p.returncode != 0:
            # The pipes are opened in binary mode; decode for a readable message.
            raise RuntimeError(f"Running f2py on {fname_pyf} failed!\n"
                               f"{out.decode(errors='replace')}\n"
                               f"{err.decode(errors='replace')}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,50 +0,0 @@
|
|||
# find numpy & f2py includes
|
||||
inc_numpy = run_command(py3,
|
||||
['-c', 'import os; os.chdir(".."); import numpy; print(numpy.get_include())'],
|
||||
check : true
|
||||
).stdout().strip()
|
||||
|
||||
inc_f2py = run_command(py3,
|
||||
['-c', 'import os; os.chdir(".."); import numpy.f2py; print(numpy.f2py.get_include())'],
|
||||
check : true
|
||||
).stdout().strip()
|
||||
|
||||
|
||||
inc_np = include_directories(inc_numpy, inc_f2py)
|
||||
fortranobject_c = inc_f2py / 'fortranobject.c'
|
||||
|
||||
|
||||
fortranobject_lib = static_library('_fortranobject',
|
||||
fortranobject_c,
|
||||
# c_args: numpy_nodepr_api,
|
||||
dependencies: py3_dep,
|
||||
include_directories: [inc_np, inc_f2py],
|
||||
gnu_symbol_visibility: 'hidden',
|
||||
)
|
||||
fortranobject_dep = declare_dependency(
|
||||
link_with: fortranobject_lib,
|
||||
include_directories: [inc_np, inc_f2py],
|
||||
)
|
||||
|
||||
|
||||
# f2py generated wrappers
|
||||
|
||||
flapack_module = custom_target('flapack_module',
|
||||
output: ['_flapackmodule.c'],
|
||||
input: 'blas_lapack.pyf.src',
|
||||
command: [generate_f2pymod, '@INPUT@', '-o', '@OUTDIR@'],
|
||||
)
|
||||
|
||||
py3.extension_module('_flapack',
|
||||
flapack_module,
|
||||
link_args: [], # version_link_args,
|
||||
dependencies: [openblas_dep, fortranobject_dep],
|
||||
install: true,
|
||||
subdir: 'openblas_wrap'
|
||||
)
|
||||
|
||||
|
||||
py3.install_sources(
|
||||
['__init__.py'],
|
||||
subdir: 'openblas_wrap'
|
||||
)
|
|
@ -1,12 +0,0 @@
|
|||
libdir=/home/br/repos/OpenBLAS/
|
||||
includedir=/home/br/repos/OpenBLAS/
|
||||
openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
|
||||
version=0.3.27
|
||||
extralib=-lm -lpthread -lgfortran -lquadmath -L${libdir} -lopenblas
|
||||
Name: openblas
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: ${version}
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Libs: -L${libdir} -lopenblas
|
||||
Libs.private: ${extralib}
|
||||
Cflags: -I${includedir}
|
|
@ -54,7 +54,7 @@ int main(int argc, char *argv[]){
|
|||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2014, 2023 The OpenBLAS Project
|
||||
Copyright (c) 2014, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
|
|||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
|
@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
|
|||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
|
||||
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
|
|
|
@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
|
|||
long long muls = n*(n+1)/2.0;
|
||||
long long adds = (n - 1.0)*n/2.0;
|
||||
|
||||
fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
|
||||
fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
|
||||
if(a != NULL){
|
||||
free(a);
|
||||
}
|
||||
|
|
140
c_check
140
c_check
|
@ -31,17 +31,13 @@ flags="$*"
|
|||
|
||||
cross_suffix=""
|
||||
|
||||
if [ "`dirname "$compiler_name"`" != '.' ]; then
|
||||
cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
|
||||
if [ "`dirname \"$compiler_name\"`" != '.' ]; then
|
||||
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
|
||||
fi
|
||||
|
||||
cn=`echo $compiler_name | sed -e 's/ -.*//'`
|
||||
bn=`basename "$cn"`
|
||||
|
||||
bn=`basename $compiler_name`
|
||||
case "$bn" in
|
||||
*-*) if [ "$bn" != '-' ]; then
|
||||
cross_suffix="$cross_suffix${bn%-*}-"
|
||||
fi
|
||||
*-*) cross_suffix="$cross_suffix${bn%-*}-"
|
||||
esac
|
||||
|
||||
compiler=""
|
||||
|
@ -91,25 +87,16 @@ case "$data" in
|
|||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_RISCV64*) architecture=riscv64 ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
*ARCH_CSKY*) architecture=csky ;;
|
||||
esac
|
||||
|
||||
defined=0
|
||||
|
||||
if [ "$os" = "AIX" ]; then
|
||||
if [ "$compiler" = "GCC" ]; then
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -maix32" ;;
|
||||
64) compiler_name="$compiler_name -maix64" ;;
|
||||
esac
|
||||
defined=1
|
||||
else
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -m32" ;;
|
||||
64) compiler_name="$compiler_name -m64" ;;
|
||||
esac
|
||||
defined=1
|
||||
fi
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -maix32" ;;
|
||||
64) compiler_name="$compiler_name -maix64" ;;
|
||||
esac
|
||||
defined=1
|
||||
fi
|
||||
|
||||
case "$architecture" in
|
||||
|
@ -175,9 +162,9 @@ fi
|
|||
exit 1
|
||||
}
|
||||
|
||||
no_msa=0
|
||||
have_msa=0
|
||||
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpd="$(mktemp -d)"
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"addvi.b $w0, $w1, 1"'
|
||||
msa_flags='-mmsa -mfp64 -mload-store-pairs'
|
||||
|
@ -185,50 +172,11 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
|||
printf "void main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
|
||||
args="$msa_flags -o $tmpf.o $tmpf"
|
||||
have_msa=1
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_msa=1
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
no_lsx=0
|
||||
no_lasx=0
|
||||
if [ "$architecture" = "loongarch64" ]; then
|
||||
lasx_flags='-march=loongarch64'
|
||||
lsx_flags='-march=loongarch64'
|
||||
|
||||
tmpd="$(mktemp -d)"
|
||||
tmparch="$tmpd/arch.c"
|
||||
printf "void main(void){ }\n" >> "$tmparch"
|
||||
args="-march=loongarch64 -o $tmparch.o $tmparch"
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
lasx_flags=''
|
||||
lsx_flags=''
|
||||
}
|
||||
|
||||
tmplsx="$tmpd/lsx.c"
|
||||
codelsx='"vadd.b $vr0, $vr0, $vr0"'
|
||||
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
|
||||
args="$lsx_flags -o $tmplsx.o $tmplsx"
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_lsx=1
|
||||
}
|
||||
|
||||
tmplasx="$tmpd/lasx.c"
|
||||
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
|
||||
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
|
||||
args="$lasx_flags -o $tmplasx.o $tmplasx"
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_lasx=1
|
||||
have_msa=0
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
|
@ -248,7 +196,6 @@ case "$data" in
|
|||
*ARCH_ARM*) architecture=arm ;;
|
||||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
*ARCH_CSKY*) architecture=csky ;;
|
||||
esac
|
||||
|
||||
binformat='bin32'
|
||||
|
@ -257,9 +204,8 @@ case "$data" in
|
|||
esac
|
||||
|
||||
no_avx512=0
|
||||
no_avx512bf=0
|
||||
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpd=`mktemp -d`
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
|
||||
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
|
@ -276,30 +222,11 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
|||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
if [ "$no_avx512" -eq 0 ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
|
||||
printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
|
||||
if [ "$compiler" = "PGI" ]; then
|
||||
args=" -tp cooperlake -c -o $tmpf.o $tmpf"
|
||||
else
|
||||
args=" -march=cooperlake -c -o $tmpf.o $tmpf"
|
||||
fi
|
||||
no_avx512bf=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_avx512bf=1
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
fi
|
||||
|
||||
no_rv64gv=0
|
||||
if [ "$architecture" = "riscv64" ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpd=`mktemp -d`
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"vsetvli zero, zero, e8, m1\n"'
|
||||
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
|
@ -313,28 +240,10 @@ if [ "$architecture" = "riscv64" ]; then
|
|||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
no_sve=0
|
||||
if [ "$architecture" = "arm64" ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
|
||||
args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
|
||||
no_sve=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_sve=1
|
||||
}
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
c11_atomics=0
|
||||
case "$data" in
|
||||
*HAVE_C11*)
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpd=`mktemp -d`
|
||||
tmpf="$tmpd/a.c"
|
||||
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
|
||||
args=" -c -o $tmpf.o $tmpf"
|
||||
|
@ -356,9 +265,6 @@ if [ "$compiler" = "GCC" ]; then
|
|||
no_avx2=0
|
||||
oldgcc=0
|
||||
data=`$compiler_name -dumpversion`
|
||||
case "$data" in *-*)
|
||||
data="${data%-*}"
|
||||
esac
|
||||
case "$data" in *.*.*)
|
||||
data="${data%.*}"
|
||||
esac
|
||||
|
@ -445,7 +351,6 @@ done
|
|||
[ "$makefile" = "-" ] && {
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
exit 0
|
||||
|
@ -470,15 +375,14 @@ done
|
|||
printf "CROSS_SUFFIX=%s\n" "$cross_suffix"
|
||||
[ "$cross" -ne 0 ] && printf "CROSS=1\n"
|
||||
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a"
|
||||
[ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n"
|
||||
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
|
||||
[ "$have_msa" -eq 1 ] && {
|
||||
printf "HAVE_MSA=1\n"
|
||||
printf "MSA_FLAGS=%s\n" "$msa_flags"
|
||||
}
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
||||
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
|
||||
} >> "$makefile"
|
||||
|
||||
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
|
||||
|
@ -492,10 +396,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
|
|||
[ "$binformat" = "bin32" ] && printf "#define __32BIT__\t1\n"
|
||||
[ "$binformat" = "bin64" ] && printf "#define __64BIT__\t1\n"
|
||||
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
|
||||
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
|
||||
[ "$have_msa" -eq 1 ] && printf "#define HAVE_MSA\t1\n"
|
||||
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
|
||||
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
|
||||
} >> "$config"
|
||||
|
||||
|
||||
|
|
50
c_check.pl
50
c_check.pl
|
@ -97,7 +97,6 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
|||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
|
@ -157,11 +156,6 @@ if ($architecture eq "loongarch64") {
|
|||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "csky") {
|
||||
$defined = 1;
|
||||
$binary = 32;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
|
@ -238,45 +232,6 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
|||
}
|
||||
}
|
||||
|
||||
$no_lsx = 0;
|
||||
$no_lasx = 0;
|
||||
if (($architecture eq "loongarch64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility";
|
||||
} else {
|
||||
$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
|
||||
$lsx_flags = "-march=loongarch64";
|
||||
print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
|
||||
|
||||
$args = "$lsx_flags -o $tmplsx.o $tmplsx";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_lsx = 1;
|
||||
} else {
|
||||
$no_lsx = 0;
|
||||
}
|
||||
unlink("$tmplsx.o");
|
||||
|
||||
$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
|
||||
$lasx_flags = "-march=loongarch64";
|
||||
print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
|
||||
|
||||
$args = "$lasx_flags -o $tmplasx.o $tmplasx";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_lasx = 1;
|
||||
} else {
|
||||
$no_lasx = 0;
|
||||
}
|
||||
unlink("$tmplasx.o");
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
|
@ -290,7 +245,6 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
|
|||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
@ -470,8 +424,6 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
|
|||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
|
||||
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
|
||||
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
|
||||
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
|
@ -485,8 +437,6 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
|||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
||||
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
|
||||
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
|
||||
|
||||
|
||||
if ($os eq "LINUX") {
|
||||
|
|
44
cblas.h
44
cblas.h
|
@ -12,7 +12,6 @@ extern "C" {
|
|||
/*Set the number of threads on runtime.*/
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
void goto_set_num_threads(int num_threads);
|
||||
int openblas_set_num_threads_local(int num_threads);
|
||||
|
||||
/*Get the number of threads on runtime.*/
|
||||
int openblas_get_num_threads(void);
|
||||
|
@ -26,11 +25,6 @@ char* openblas_get_config(void);
|
|||
/*Get the CPU corename on runtime.*/
|
||||
char* openblas_get_corename(void);
|
||||
|
||||
/*Set the threading backend to a custom callback.*/
|
||||
typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
|
||||
typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
|
||||
void openblas_set_threads_callback_function(openblas_threads_callback callback);
|
||||
|
||||
#ifdef OPENBLAS_OS_LINUX
|
||||
/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
|
||||
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
|
||||
|
@ -106,16 +100,6 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
|||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
@ -131,9 +115,6 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
|
|||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
@ -377,7 +358,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
|
|||
void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
|
||||
void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
|
||||
void cblas_xerbla(blasint p, char *rout, char *form, ...);
|
||||
|
||||
/*** BLAS extensions ***/
|
||||
|
||||
|
@ -407,27 +388,15 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum
|
|||
void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a,
|
||||
OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb);
|
||||
|
||||
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
|
||||
void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta,
|
||||
float *c, OPENBLAS_CONST blasint cldc);
|
||||
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
|
||||
void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta,
|
||||
double *c, OPENBLAS_CONST blasint cldc);
|
||||
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
|
||||
void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta,
|
||||
float *c, OPENBLAS_CONST blasint cldc);
|
||||
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
|
||||
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
|
||||
double *c, OPENBLAS_CONST blasint cldc);
|
||||
|
||||
void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
|
||||
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST float ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST float ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
|
||||
|
||||
void cblas_dgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
|
||||
OPENBLAS_CONST double * alpha_array, OPENBLAS_CONST double ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST double ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST double * beta_array, double ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
|
||||
|
||||
void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
|
||||
OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
|
||||
|
||||
void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
|
||||
OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
|
||||
|
||||
/*** BFLOAT16 and INT8 extensions ***/
|
||||
/* convert float array to BFLOAT16 array by rounding */
|
||||
void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
|
||||
|
@ -443,9 +412,6 @@ void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum
|
|||
|
||||
void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||
void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
|
||||
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
|
|
@ -44,12 +44,9 @@ endif ()
|
|||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX)
|
||||
endif ()
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110)
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
@ -57,11 +54,7 @@ if (DYNAMIC_ARCH)
|
|||
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
|
||||
endif ()
|
||||
|
||||
if (RISCV64)
|
||||
set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B)
|
||||
endif ()
|
||||
|
||||
|
||||
if (X86)
|
||||
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
|
||||
endif ()
|
||||
|
@ -86,7 +79,7 @@ if (DYNAMIC_ARCH)
|
|||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
|
@ -94,10 +87,6 @@ if (DYNAMIC_ARCH)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (LOONGARCH64)
|
||||
set(DYNAMIC_CORE LA64_GENERIC LA264 LA464)
|
||||
endif ()
|
||||
|
||||
if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)
|
||||
message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again")
|
||||
endif ()
|
||||
|
@ -143,8 +132,3 @@ if (ARM64)
|
|||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
||||
if (RISCV64)
|
||||
set(NO_BINARY_MODE 1)
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
||||
|
|
105
cmake/cc.cmake
105
cmake/cc.cmake
|
@ -2,18 +2,12 @@
|
|||
## Author: Hank Anderson <hank@statease.com>
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets C related variables.
|
||||
include(CheckCCompilerFlag)
|
||||
|
||||
if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -fp-model=consistent")
|
||||
set(GCC_VERSION 100)
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
||||
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
||||
set(NO_UNINITIALIZED_WARN "-Wno-uninitialized")
|
||||
set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION})
|
||||
|
||||
if (QUIET_MAKE)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} ${NO_UNINITIALIZED_WARN} -Wno-unused")
|
||||
|
@ -42,19 +36,9 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
|
|||
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
|
||||
if(COMPILER_SUPPORT_LP64D_ABI)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
|
||||
else()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
|
||||
endif ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
|
||||
else ()
|
||||
CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
|
||||
if(COMPILER_SUPPORT_ILP32D_ABI)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
|
||||
else()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
@ -81,14 +65,6 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
|
||||
if (POWER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
|
||||
else ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||
|
@ -145,6 +121,7 @@ endif ()
|
|||
if (${CORE} STREQUAL COOPERLAKE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake")
|
||||
else ()
|
||||
|
@ -157,6 +134,7 @@ endif ()
|
|||
if (${CORE} STREQUAL SAPPHIRERAPIDS)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids")
|
||||
else ()
|
||||
|
@ -166,22 +144,9 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL ZEN)
|
||||
if (HAVE_AVX512VL)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
if (${GCC_VERSION} VERSION_GREATER 13.0 OR ${GCC_VERSION} VERSION_EQUAL 13.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=znver4")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL A64FX)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
|
||||
else ()
|
||||
|
@ -192,34 +157,29 @@ endif ()
|
|||
|
||||
if (${CORE} STREQUAL NEOVERSEN2)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEV1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEN1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1")
|
||||
else ()
|
||||
|
@ -230,11 +190,7 @@ endif ()
|
|||
|
||||
if (${CORE} STREQUAL ARMV8SVE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
@ -264,21 +220,23 @@ endif ()
|
|||
|
||||
if (${CORE} STREQUAL POWER10)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler GCC ${GCC_VERSION} does not support Power10." )
|
||||
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." )
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER9)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
message(WARNING "Compiler GCC ${GCC_VERSION} does not fully support Power9.")
|
||||
message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
@ -289,27 +247,6 @@ if (${CORE} STREQUAL POWER8)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone
|
||||
# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case.
|
||||
if (${CORE} STREQUAL PPC970)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
|
||||
endif ()
|
||||
if (APPLE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
|
||||
if (${CORE} STREQUAL PPCG4)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
|
||||
endif ()
|
||||
if (APPLE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -force_cpusubtype_ALL")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (HAVE_AVX2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
|
||||
|
|
|
@ -64,7 +64,6 @@ else ()
|
|||
"#define NEEDBUNDERSCORE 1\n")
|
||||
endif()
|
||||
|
||||
if (CMAKE_Fortran_COMPILER)
|
||||
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
|
||||
string(TOUPPER ${F_COMPILER} F_COMPILER)
|
||||
endif()
|
||||
|
||||
|
|
|
@ -3,9 +3,11 @@
|
|||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
endif ()
|
||||
if (USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
|
@ -36,68 +38,34 @@ if (${F_COMPILER} STREQUAL "G95")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# ensure reentrancy of lapack codes
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
# work around ABI violation in passing string arguments from C
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||
if (NOT NO_LAPACK)
|
||||
# Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
||||
endif ()
|
||||
# ensure reentrancy of lapack codes
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
# work around ABI violation in passing string arguments from C
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
if (NOT NO_LAPACK)
|
||||
set(EXTRALIB "{EXTRALIB} -lgfortran")
|
||||
endif ()
|
||||
if (NO_BINARY_MODE)
|
||||
if (MIPS64)
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
|
||||
endif ()
|
||||
endif ()
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
|
||||
if(COMPILER_SUPPORT_LP64D_ABI)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
|
||||
else()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
|
||||
endif ()
|
||||
endif ()
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
|
||||
else ()
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
|
||||
if(COMPILER_SUPPORT_ILP32D_ABI)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
|
||||
else()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
if (RISCV64)
|
||||
if (BINARY64)
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
if (ARM64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
else ()
|
||||
if (BINARY64)
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
endif ()
|
||||
if (INTERFACE64)
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
|
||||
if (WIN32)
|
||||
|
@ -110,9 +78,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
|||
endif ()
|
||||
endif ()
|
||||
else ()
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
|
||||
endif ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
@ -121,12 +87,12 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "INTEL" OR CMAKE_Fortran_COMPILER_ID MATCHES "Intel")
|
||||
if (${F_COMPILER} STREQUAL "INTEL")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL")
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
endif ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -recursive -fp-model=consistent")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -recursive")
|
||||
if (USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
|
||||
endif ()
|
||||
|
@ -155,7 +121,7 @@ if (${F_COMPILER} STREQUAL "IBM")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
|
||||
if (${F_COMPILER} STREQUAL "PGI")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
|
||||
set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
|
||||
if (BINARY64)
|
||||
|
|
|
@ -22,7 +22,7 @@ set(SCLAUX
|
|||
slasd7.f slasd8.f slasda.f slasdq.f slasdt.f
|
||||
slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f
|
||||
slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
|
||||
ssteqr.f ssterf.f slaisnan.f sisnan.f slarmm.f
|
||||
ssteqr.f ssterf.f slaisnan.f sisnan.f
|
||||
slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f
|
||||
../INSTALL/second_${TIMER}.f)
|
||||
|
||||
|
@ -42,7 +42,7 @@ set(DZLAUX
|
|||
dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
|
||||
dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
|
||||
dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
|
||||
dsteqr.f dsterf.f dlaisnan.f disnan.f dlarmm.f
|
||||
dsteqr.f dsterf.f dlaisnan.f disnan.f
|
||||
dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f
|
||||
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f)
|
||||
|
||||
|
@ -52,7 +52,7 @@ set(SLASRC
|
|||
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
|
||||
sgehd2.f sgehrd.f sgelq2.f sgelqf.f
|
||||
sgels.f sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f
|
||||
sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
|
||||
sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
|
||||
sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f
|
||||
sgetrf2.f sgetri.f
|
||||
sggbak.f sggbal.f
|
||||
|
@ -67,7 +67,7 @@ set(SLASRC
|
|||
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
|
||||
slansy.f slantb.f slantp.f slantr.f slanv2.f
|
||||
slapll.f slapmt.f
|
||||
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
|
||||
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
|
||||
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
|
||||
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
|
||||
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
|
||||
|
@ -124,7 +124,7 @@ set(SLASRC
|
|||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
|
||||
slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90)
|
||||
slarmm.f slatrs3.f strsyl3.f sgelst.f)
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
|
@ -139,7 +139,7 @@ set(CLASRC
|
|||
cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
|
||||
cgecon.f cgeequ.f cgees.f cgeesx.f cgeev.f cgeevx.f
|
||||
cgehd2.f cgehrd.f cgelq2.f cgelqf.f
|
||||
cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f
|
||||
cgels.f cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
|
||||
cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f
|
||||
cgesc2.f cgesdd.f cgesvd.f cgesvdx.f
|
||||
cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f
|
||||
|
@ -173,7 +173,7 @@ set(CLASRC
|
|||
clanhb.f clanhe.f
|
||||
clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
|
||||
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
|
||||
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
|
||||
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
|
||||
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
|
||||
claqz0.f claqz1.f claqz2.f claqz3.f
|
||||
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
|
||||
|
@ -187,7 +187,7 @@ set(CLASRC
|
|||
cposv.f cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
|
||||
cppcon.f cppequ.f cpprfs.f cppsv.f cppsvx.f cpptrf.f cpptri.f cpptrs.f
|
||||
cptcon.f cpteqr.f cptrfs.f cptsv.f cptsvx.f cpttrf.f cpttrs.f cptts2.f
|
||||
crot.f crscl.f cspcon.f csprfs.f cspsv.f
|
||||
crot.f cspcon.f csprfs.f cspsv.f
|
||||
cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
|
||||
cstegr.f cstein.f csteqr.f csycon.f
|
||||
csyrfs.f csysv.f csysvx.f csytf2.f csytrf.f csytri.f
|
||||
|
@ -223,7 +223,7 @@ set(CLASRC
|
|||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cungtsqr_row.f cunhr_col.f
|
||||
clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90)
|
||||
clatrs3.f ctrsyl3.f cgelst.f)
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
|
@ -243,7 +243,7 @@ set(DLASRC
|
|||
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
|
||||
dgehd2.f dgehrd.f dgelq2.f dgelqf.f
|
||||
dgels.f dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f
|
||||
dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
|
||||
dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
|
||||
dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f
|
||||
dgetrf2.f dgetri.f
|
||||
dggbak.f dggbal.f
|
||||
|
@ -258,7 +258,7 @@ set(DLASRC
|
|||
dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
|
||||
dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
|
||||
dlapll.f dlapmt.f
|
||||
dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
|
||||
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
|
||||
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
|
||||
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
|
||||
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
|
||||
|
@ -316,7 +316,7 @@ set(DLASRC
|
|||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
|
||||
dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90)
|
||||
dlarmm.f dlatrs3.f dtrsyl3.f dgelst.f)
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
|
@ -331,7 +331,7 @@ set(ZLASRC
|
|||
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
|
||||
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
|
||||
zgehd2.f zgehrd.f zgelq2.f zgelqf.f
|
||||
zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f
|
||||
zgels.f zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
|
||||
zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
|
||||
zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f
|
||||
zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f
|
||||
|
@ -367,7 +367,7 @@ set(ZLASRC
|
|||
zlanhe.f
|
||||
zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
|
||||
zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f
|
||||
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f
|
||||
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
|
||||
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
|
||||
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
|
||||
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
|
||||
|
@ -381,7 +381,7 @@ set(ZLASRC
|
|||
zposv.f zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
|
||||
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
|
||||
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
|
||||
zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
|
||||
zrot.f zspcon.f zsprfs.f zspsv.f
|
||||
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
|
||||
zstegr.f zstein.f zsteqr.f zsycon.f
|
||||
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f
|
||||
|
@ -419,7 +419,7 @@ set(ZLASRC
|
|||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zungtsqr_row.f zunhr_col.f
|
||||
zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90)
|
||||
zlatrs3.f ztrsyl3.f zgelst.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
@ -436,25 +436,19 @@ if(USE_XBLAS)
|
|||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||
endif()
|
||||
|
||||
if(BUILD_LAPACK_DEPRECATED)
|
||||
list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
|
||||
DEPRECATED/sgelqs.f DEPRECATED/sgeqrs.f
|
||||
DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
|
||||
DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
|
||||
list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f
|
||||
DEPRECATED/dgelqs.f DEPRECATED/dgeqrs.f
|
||||
DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f
|
||||
DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f)
|
||||
list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f
|
||||
DEPRECATED/cgelqs.f DEPRECATED/cgeqrs.f
|
||||
DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f
|
||||
DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f)
|
||||
list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
|
||||
DEPRECATED/zgelqs.f DEPRECATED/zgeqrs.f
|
||||
DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
|
||||
DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
|
||||
message(STATUS "Building deprecated routines")
|
||||
endif()
|
||||
|
||||
set(DSLASRC spotrs.f)
|
||||
|
||||
|
@ -529,7 +523,7 @@ set(SCLAUX
|
|||
slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c
|
||||
slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c
|
||||
ssteqr.c ssterf.c slaisnan.c sisnan.c
|
||||
slartgp.c slartgs.c slarmm.c
|
||||
slartgp.c slartgs.c
|
||||
../INSTALL/second_${TIMER}.c)
|
||||
|
||||
set(DZLAUX
|
||||
|
@ -548,7 +542,7 @@ set(DZLAUX
|
|||
dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c
|
||||
dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c
|
||||
dsteqr.c dsterf.c dlaisnan.c disnan.c
|
||||
dlartgp.c dlartgs.c dlarmm.c
|
||||
dlartgp.c dlartgs.c
|
||||
../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c)
|
||||
|
||||
set(SLASRC
|
||||
|
@ -557,7 +551,7 @@ set(SLASRC
|
|||
sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c
|
||||
sgehd2.c sgehrd.c sgelq2.c sgelqf.c
|
||||
sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
|
||||
sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
|
||||
sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
|
||||
sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
|
||||
sgetrf2.c sgetri.c
|
||||
sggbak.c sggbal.c
|
||||
|
@ -571,7 +565,7 @@ set(SLASRC
|
|||
slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c
|
||||
slansy.c slantb.c slantp.c slantr.c slanv2.c
|
||||
slapll.c slapmt.c
|
||||
slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
|
||||
slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
|
||||
slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
|
||||
slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
|
||||
slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
|
||||
|
@ -628,7 +622,7 @@ set(SLASRC
|
|||
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
|
||||
sgesvdq.c slaorhr_col_getrfnp.c
|
||||
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
|
||||
slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c)
|
||||
slarmm.c slatrs3.c strsyl3.c sgelst.c)
|
||||
|
||||
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
|
||||
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
|
||||
|
@ -643,7 +637,7 @@ set(CLASRC
|
|||
cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
|
||||
cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c
|
||||
cgehd2.c cgehrd.c cgelq2.c cgelqf.c
|
||||
cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c
|
||||
cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
|
||||
cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
|
||||
cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
|
||||
cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
|
||||
|
@ -677,7 +671,7 @@ set(CLASRC
|
|||
clanhb.c clanhe.c
|
||||
clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c
|
||||
clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c
|
||||
claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c
|
||||
claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
|
||||
claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
|
||||
claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
|
||||
clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
|
||||
|
@ -690,7 +684,7 @@ set(CLASRC
|
|||
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
|
||||
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
|
||||
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
|
||||
crot.c crscl.c cspcon.c csprfs.c cspsv.c
|
||||
crot.c cspcon.c csprfs.c cspsv.c
|
||||
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
|
||||
cstegr.c cstein.c csteqr.c csycon.c
|
||||
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
|
||||
|
@ -726,7 +720,7 @@ set(CLASRC
|
|||
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
|
||||
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
|
||||
cungtsqr.c cungtsqr_row.c cunhr_col.c
|
||||
clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c)
|
||||
clatrs3.c ctrsyl3.c cgelst.c)
|
||||
|
||||
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
|
||||
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
|
||||
|
@ -746,7 +740,7 @@ set(DLASRC
|
|||
dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c
|
||||
dgehd2.c dgehrd.c dgelq2.c dgelqf.c
|
||||
dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
|
||||
dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
|
||||
dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
|
||||
dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
|
||||
dgetrf2.c dgetri.c
|
||||
dggbak.c dggbal.c
|
||||
|
@ -760,7 +754,7 @@ set(DLASRC
|
|||
dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c
|
||||
dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
|
||||
dlapll.c dlapmt.c
|
||||
dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
|
||||
dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
|
||||
dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
|
||||
dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
|
||||
dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
|
||||
|
@ -818,7 +812,7 @@ set(DLASRC
|
|||
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
|
||||
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
|
||||
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
|
||||
dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c)
|
||||
dlarmm.c dlatrs3.c dtrsyl3.c dgelst.c)
|
||||
|
||||
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
|
||||
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
|
||||
|
@ -833,7 +827,7 @@ set(ZLASRC
|
|||
zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
|
||||
zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c
|
||||
zgehd2.c zgehrd.c zgelq2.c zgelqf.c
|
||||
zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c
|
||||
zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
|
||||
zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
|
||||
zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
|
||||
zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
|
||||
|
@ -868,7 +862,7 @@ set(ZLASRC
|
|||
zlanhe.c
|
||||
zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c
|
||||
zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c
|
||||
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
|
||||
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
|
||||
zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
|
||||
zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
|
||||
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
|
||||
|
@ -882,7 +876,7 @@ set(ZLASRC
|
|||
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
|
||||
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
|
||||
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
|
||||
zrot.c zrscl.c zspcon.c zsprfs.c zspsv.c
|
||||
zrot.c zspcon.c zsprfs.c zspsv.c
|
||||
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
|
||||
zstegr.c zstein.c zsteqr.c zsycon.c
|
||||
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
|
||||
|
@ -919,8 +913,7 @@ set(ZLASRC
|
|||
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
|
||||
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
|
||||
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
|
||||
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c
|
||||
zgedmd.c zgedmdq.c)
|
||||
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
|
||||
|
||||
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
|
||||
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
|
||||
|
@ -937,25 +930,19 @@ if(USE_XBLAS)
|
|||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||
endif()
|
||||
|
||||
if(BUILD_LAPACK_DEPRECATED)
|
||||
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
|
||||
DEPRECATED/sgelqs.c DEPRECATED/sgeqrs.c
|
||||
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
|
||||
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
|
||||
list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c
|
||||
DEPRECATED/dgelqs.c DEPRECATED/dgeqrs.c
|
||||
DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c
|
||||
DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c)
|
||||
list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c
|
||||
DEPRECATED/cgelqs.c DEPRECATED/cgeqrs.c
|
||||
DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c
|
||||
DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c)
|
||||
list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
|
||||
DEPRECATED/zgelqs.c DEPRECATED/zgeqrs.c
|
||||
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
|
||||
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
|
||||
message(STATUS "Building deprecated routines")
|
||||
endif()
|
||||
|
||||
set(DSLASRC spotrs.c)
|
||||
|
||||
|
|
|
@ -70,6 +70,8 @@ set(CSRC
|
|||
lapacke_cgeqlf_work.c
|
||||
lapacke_cgeqp3.c
|
||||
lapacke_cgeqp3_work.c
|
||||
lapacke_cgeqpf.c
|
||||
lapacke_cgeqpf_work.c
|
||||
lapacke_cgeqr.c
|
||||
lapacke_cgeqr_work.c
|
||||
lapacke_cgeqr2.c
|
||||
|
@ -90,10 +92,6 @@ set(CSRC
|
|||
lapacke_cgerqf_work.c
|
||||
lapacke_cgesdd.c
|
||||
lapacke_cgesdd_work.c
|
||||
lapacke_cgedmd.c
|
||||
lapacke_cgedmd_work.c
|
||||
lapacke_cgedmdq.c
|
||||
lapacke_cgedmdq_work.c
|
||||
lapacke_cgesv.c
|
||||
lapacke_cgesv_work.c
|
||||
lapacke_cgesvd.c
|
||||
|
@ -146,8 +144,12 @@ set(CSRC
|
|||
lapacke_cggqrf_work.c
|
||||
lapacke_cggrqf.c
|
||||
lapacke_cggrqf_work.c
|
||||
lapacke_cggsvd.c
|
||||
lapacke_cggsvd_work.c
|
||||
lapacke_cggsvd3.c
|
||||
lapacke_cggsvd3_work.c
|
||||
lapacke_cggsvp.c
|
||||
lapacke_cggsvp_work.c
|
||||
lapacke_cggsvp3.c
|
||||
lapacke_cggsvp3_work.c
|
||||
lapacke_cgtcon.c
|
||||
|
@ -562,8 +564,6 @@ set(CSRC
|
|||
lapacke_ctrsna_work.c
|
||||
lapacke_ctrsyl.c
|
||||
lapacke_ctrsyl_work.c
|
||||
lapacke_ctrsyl3.c
|
||||
lapacke_ctrsyl3_work.c
|
||||
lapacke_ctrtri.c
|
||||
lapacke_ctrtri_work.c
|
||||
lapacke_ctrtrs.c
|
||||
|
@ -596,8 +596,6 @@ set(CSRC
|
|||
lapacke_cungtr_work.c
|
||||
lapacke_cungtsqr_row.c
|
||||
lapacke_cungtsqr_row_work.c
|
||||
lapacke_cunhr_col.c
|
||||
lapacke_cunhr_col_work.c
|
||||
lapacke_cunmbr.c
|
||||
lapacke_cunmbr_work.c
|
||||
lapacke_cunmhr.c
|
||||
|
@ -697,6 +695,8 @@ set(DSRC
|
|||
lapacke_dgeqlf_work.c
|
||||
lapacke_dgeqp3.c
|
||||
lapacke_dgeqp3_work.c
|
||||
lapacke_dgeqpf.c
|
||||
lapacke_dgeqpf_work.c
|
||||
lapacke_dgeqr.c
|
||||
lapacke_dgeqr_work.c
|
||||
lapacke_dgeqr2.c
|
||||
|
@ -717,10 +717,6 @@ set(DSRC
|
|||
lapacke_dgerqf_work.c
|
||||
lapacke_dgesdd.c
|
||||
lapacke_dgesdd_work.c
|
||||
lapacke_dgedmd.c
|
||||
lapacke_dgedmd_work.c
|
||||
lapacke_dgedmdq.c
|
||||
lapacke_dgedmdq_work.c
|
||||
lapacke_dgesv.c
|
||||
lapacke_dgesv_work.c
|
||||
lapacke_dgesvd.c
|
||||
|
@ -775,8 +771,12 @@ set(DSRC
|
|||
lapacke_dggqrf_work.c
|
||||
lapacke_dggrqf.c
|
||||
lapacke_dggrqf_work.c
|
||||
lapacke_dggsvd.c
|
||||
lapacke_dggsvd_work.c
|
||||
lapacke_dggsvd3.c
|
||||
lapacke_dggsvd3_work.c
|
||||
lapacke_dggsvp.c
|
||||
lapacke_dggsvp_work.c
|
||||
lapacke_dggsvp3.c
|
||||
lapacke_dggsvp3_work.c
|
||||
lapacke_dgtcon.c
|
||||
|
@ -874,8 +874,6 @@ set(DSRC
|
|||
lapacke_dorgtr_work.c
|
||||
lapacke_dorgtsqr_row.c
|
||||
lapacke_dorgtsqr_row_work.c
|
||||
lapacke_dorhr_col.c
|
||||
lapacke_dorhr_col_work.c
|
||||
lapacke_dormbr.c
|
||||
lapacke_dormbr_work.c
|
||||
lapacke_dormhr.c
|
||||
|
@ -1188,8 +1186,6 @@ set(DSRC
|
|||
lapacke_dtrsna_work.c
|
||||
lapacke_dtrsyl.c
|
||||
lapacke_dtrsyl_work.c
|
||||
lapacke_dtrsyl3.c
|
||||
lapacke_dtrsyl3_work.c
|
||||
lapacke_dtrtri.c
|
||||
lapacke_dtrtri_work.c
|
||||
lapacke_dtrtrs.c
|
||||
|
@ -1279,6 +1275,8 @@ set(SSRC
|
|||
lapacke_sgeqlf_work.c
|
||||
lapacke_sgeqp3.c
|
||||
lapacke_sgeqp3_work.c
|
||||
lapacke_sgeqpf.c
|
||||
lapacke_sgeqpf_work.c
|
||||
lapacke_sgeqr.c
|
||||
lapacke_sgeqr_work.c
|
||||
lapacke_sgeqr2.c
|
||||
|
@ -1299,10 +1297,6 @@ set(SSRC
|
|||
lapacke_sgerqf_work.c
|
||||
lapacke_sgesdd.c
|
||||
lapacke_sgesdd_work.c
|
||||
lapacke_sgedmd.c
|
||||
lapacke_sgedmd_work.c
|
||||
lapacke_sgedmdq.c
|
||||
lapacke_sgedmdq_work.c
|
||||
lapacke_sgesv.c
|
||||
lapacke_sgesv_work.c
|
||||
lapacke_sgesvd.c
|
||||
|
@ -1357,8 +1351,12 @@ set(SSRC
|
|||
lapacke_sggqrf_work.c
|
||||
lapacke_sggrqf.c
|
||||
lapacke_sggrqf_work.c
|
||||
lapacke_sggsvd.c
|
||||
lapacke_sggsvd_work.c
|
||||
lapacke_sggsvd3.c
|
||||
lapacke_sggsvd3_work.c
|
||||
lapacke_sggsvp.c
|
||||
lapacke_sggsvp_work.c
|
||||
lapacke_sggsvp3.c
|
||||
lapacke_sggsvp3_work.c
|
||||
lapacke_sgtcon.c
|
||||
|
@ -1455,8 +1453,6 @@ set(SSRC
|
|||
lapacke_sorgtr_work.c
|
||||
lapacke_sorgtsqr_row.c
|
||||
lapacke_sorgtsqr_row_work.c
|
||||
lapacke_sorhr_col.c
|
||||
lapacke_sorhr_col_work.c
|
||||
lapacke_sormbr.c
|
||||
lapacke_sormbr_work.c
|
||||
lapacke_sormhr.c
|
||||
|
@ -1766,8 +1762,6 @@ set(SSRC
|
|||
lapacke_strsna_work.c
|
||||
lapacke_strsyl.c
|
||||
lapacke_strsyl_work.c
|
||||
lapacke_ctrsyl3.c
|
||||
lapacke_ctrsyl3_work.c
|
||||
lapacke_strtri.c
|
||||
lapacke_strtri_work.c
|
||||
lapacke_strtrs.c
|
||||
|
@ -1855,6 +1849,8 @@ set(ZSRC
|
|||
lapacke_zgeqlf_work.c
|
||||
lapacke_zgeqp3.c
|
||||
lapacke_zgeqp3_work.c
|
||||
lapacke_zgeqpf.c
|
||||
lapacke_zgeqpf_work.c
|
||||
lapacke_zgeqr.c
|
||||
lapacke_zgeqr_work.c
|
||||
lapacke_zgeqr2.c
|
||||
|
@ -1875,10 +1871,6 @@ set(ZSRC
|
|||
lapacke_zgerqf_work.c
|
||||
lapacke_zgesdd.c
|
||||
lapacke_zgesdd_work.c
|
||||
lapacke_zgedmd.c
|
||||
lapacke_zgedmd_work.c
|
||||
lapacke_zgedmdq.c
|
||||
lapacke_zgedmdq_work.c
|
||||
lapacke_zgesv.c
|
||||
lapacke_zgesv_work.c
|
||||
lapacke_zgesvd.c
|
||||
|
@ -1933,8 +1925,12 @@ set(ZSRC
|
|||
lapacke_zggqrf_work.c
|
||||
lapacke_zggrqf.c
|
||||
lapacke_zggrqf_work.c
|
||||
lapacke_zggsvd.c
|
||||
lapacke_zggsvd_work.c
|
||||
lapacke_zggsvd3.c
|
||||
lapacke_zggsvd3_work.c
|
||||
lapacke_zggsvp.c
|
||||
lapacke_zggsvp_work.c
|
||||
lapacke_zggsvp3.c
|
||||
lapacke_zggsvp3_work.c
|
||||
lapacke_zgtcon.c
|
||||
|
@ -2347,8 +2343,6 @@ set(ZSRC
|
|||
lapacke_ztrsna_work.c
|
||||
lapacke_ztrsyl.c
|
||||
lapacke_ztrsyl_work.c
|
||||
lapacke_ztrsyl3.c
|
||||
lapacke_ztrsyl3_work.c
|
||||
lapacke_ztrtri.c
|
||||
lapacke_ztrtri_work.c
|
||||
lapacke_ztrtrs.c
|
||||
|
@ -2381,8 +2375,6 @@ set(ZSRC
|
|||
lapacke_zungtr_work.c
|
||||
lapacke_zungtsqr_row.c
|
||||
lapacke_zungtsqr_row_work.c
|
||||
lapacke_zunhr_col.c
|
||||
lapacke_zunhr_col_work.c
|
||||
lapacke_zunmbr.c
|
||||
lapacke_zunmbr_work.c
|
||||
lapacke_zunmhr.c
|
||||
|
@ -2409,12 +2401,6 @@ set(ZSRC
|
|||
lapacke_csyr_work.c
|
||||
lapacke_ilaver.c
|
||||
)
|
||||
if (BUILD_LAPACK_DEPRECATED)
|
||||
set(SRCS $SRCS lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
|
||||
set(SRCD $SRCD lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
|
||||
set(SRCC $SRCC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
|
||||
set(SRCZ $SRCZ lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
|
||||
endif()
|
||||
|
||||
set(SRCX
|
||||
lapacke_cgbrfsx.c lapacke_cporfsx.c lapacke_dgerfsx.c lapacke_sgbrfsx.c lapacke_ssyrfsx.c lapacke_zherfsx.c
|
||||
|
|
|
@ -1,13 +1,11 @@
|
|||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
libnameprefix=@LIBNAMEPREFIX@
|
||||
libnamesuffix=@LIBNAMESUFFIX@
|
||||
libsuffix=@SUFFIX64_UNDERSCORE@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OpenBLAS_VERSION@
|
||||
URL: https://github.com/OpenMathLib/OpenBLAS
|
||||
Libs: -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix}
|
||||
Cflags: -I${includedir} @OpenMP_C_FLAGS@
|
||||
Version: @OPENBLAS_VERSION@
|
||||
URL: https://github.com/xianyi/OpenBLAS
|
||||
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
|
||||
Cflags: -I${includedir}
|
||||
|
|
|
@ -38,7 +38,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
|||
|
||||
# Test for supporting MS_ABI
|
||||
# removed string parsing in favor of CMake's version comparison -hpa
|
||||
set(GCC_VERSION ${CMAKE_C_COMPILER_VERSION})
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
|
||||
# GCC Version >=4.7
|
||||
# It is compatible with MSVC ABI.
|
||||
|
|
|
@ -823,41 +823,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV5")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
"#define L2_SIZE\t512488\n"
|
||||
"#define L2_LINESIZE\t32\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define L2_ASSOCIATIVE\t4\n")
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV6")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
"#define L2_SIZE\t512488\n"
|
||||
"#define L2_LINESIZE\t32\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define L2_ASSOCIATIVE\t4\n"
|
||||
"#define HAVE_VFP\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
|
@ -921,18 +886,14 @@ else ()
|
|||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
endif ()
|
||||
if ("${TCORE}" STREQUAL "CORTEXA53")
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
else ()
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
endif ()
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73" OR "${TCORE}" STREQUAL "CORTEXA76")
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t49152\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
@ -1218,37 +1179,6 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "A64FX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t256\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t8\n"
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t256\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t8\n"
|
||||
"#define L2_SIZE\t8388608\n\n"
|
||||
"#define L2_LINESIZE\t256\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define L3_SIZE\t0\n\n"
|
||||
"#define L3_LINESIZE\t0\n\n"
|
||||
"#define L3_ASSOCIATIVE\t0\n\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define HAVE_SVE\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 8)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "P5600")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L2_SIZE 1048576\n"
|
||||
|
@ -1340,63 +1270,6 @@ endif ()
|
|||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
elseif ("${TCORE}" STREQUAL "RISCV64_GENERIC")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 32\n"
|
||||
"#define L2_SIZE 1048576\n"
|
||||
"#define L2_LINESIZE 32 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 4\n")
|
||||
elseif ("${TCORE}" STREQUAL "LA64_GENERIC")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 8)
|
||||
set(CGEMM_UNROLL_M 1)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 1)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(CGEMM3M_UNROLL_M 2)
|
||||
set(CGEMM3M_UNROLL_N 8)
|
||||
set(ZGEMM3M_UNROLL_M 2)
|
||||
set(ZGEMM3M_UNROLL_N 8)
|
||||
elseif ("${TCORE}" STREQUAL "LA264")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(HAVE_LSX 1)
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(CGEMM3M_UNROLL_M 2)
|
||||
set(CGEMM3M_UNROLL_N 8)
|
||||
set(ZGEMM3M_UNROLL_M 8)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "LA464")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(HAVE_LASX 1)
|
||||
set(HAVE_LSX 1)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 6)
|
||||
set(CGEMM_UNROLL_M 16)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(CGEMM3M_UNROLL_M 16)
|
||||
set(CGEMM3M_UNROLL_N 8)
|
||||
set(ZGEMM3M_UNROLL_M 16)
|
||||
set(ZGEMM3M_UNROLL_N 6)
|
||||
endif()
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
|
@ -1430,7 +1303,7 @@ else(NOT CMAKE_CROSSCOMPILING)
|
|||
|
||||
if ("${CMAKE_C_COMPILER_ID}" STREQUAL "MSVC")
|
||||
#Use generic for MSVC now
|
||||
message(STATUS "MSVC")
|
||||
message("MSVC")
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
else()
|
||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||
|
@ -1446,25 +1319,16 @@ else(NOT CMAKE_CROSSCOMPILING)
|
|||
|
||||
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
|
||||
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY "${GETARCH_DIR}")
|
||||
configure_file("${TARGET_CONF_TEMP}" "${GETARCH_DIR}/${TARGET_CONF}" COPYONLY)
|
||||
file(MAKE_DIRECTORY ${GETARCH_DIR})
|
||||
configure_file(${TARGET_CONF_TEMP} ${GETARCH_DIR}/${TARGET_CONF} COPYONLY)
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
if (CMAKE_ASM_COMPILER_ID STREQUAL "")
|
||||
try_compile(GETARCH_RESULT "${GETARCH_DIR}"
|
||||
SOURCES ${GETARCH_SRC}
|
||||
CMAKE_FLAGS "-DCMAKE_ASM_COMPILER=${CMAKE_C_COMPILER}"
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH_BIN}"
|
||||
)
|
||||
else()
|
||||
try_compile(GETARCH_RESULT "${GETARCH_DIR}"
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH_BIN}"
|
||||
)
|
||||
endif()
|
||||
try_compile(GETARCH_RESULT ${GETARCH_DIR}
|
||||
SOURCES ${GETARCH_SRC}
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
|
||||
endif ()
|
||||
|
@ -1493,19 +1357,19 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE
|
|||
message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}")
|
||||
|
||||
# append config data from getarch to the TARGET file and read in CMake vars
|
||||
file(APPEND "${TARGET_CONF_TEMP}" ${GETARCH_CONF_OUT})
|
||||
file(APPEND ${TARGET_CONF_TEMP} ${GETARCH_CONF_OUT})
|
||||
ParseGetArchVars(${GETARCH_MAKE_OUT})
|
||||
|
||||
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
|
||||
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
|
||||
file(MAKE_DIRECTORY "${GETARCH2_DIR}")
|
||||
configure_file("${TARGET_CONF_TEMP}" "${GETARCH2_DIR}/${TARGET_CONF}" COPYONLY)
|
||||
file(MAKE_DIRECTORY ${GETARCH2_DIR})
|
||||
configure_file(${TARGET_CONF_TEMP} ${GETARCH2_DIR}/${TARGET_CONF} COPYONLY)
|
||||
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
try_compile(GETARCH2_RESULT "${GETARCH2_DIR}"
|
||||
SOURCES "${PROJECT_SOURCE_DIR}/getarch_2nd.c"
|
||||
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
|
||||
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
|
||||
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
|
||||
OUTPUT_VARIABLE GETARCH2_LOG
|
||||
COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}"
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
|
||||
)
|
||||
|
||||
if (NOT ${GETARCH2_RESULT})
|
||||
|
@ -1518,9 +1382,9 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}" 0 OUTPUT_VARIABL
|
|||
execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}" 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT)
|
||||
|
||||
# append config data from getarch_2nd to the TARGET file and read in CMake vars
|
||||
file(APPEND "${TARGET_CONF_TEMP}" ${GETARCH2_CONF_OUT})
|
||||
file(APPEND ${TARGET_CONF_TEMP} ${GETARCH2_CONF_OUT})
|
||||
|
||||
configure_file("${TARGET_CONF_TEMP}" "${TARGET_CONF_DIR}/${TARGET_CONF}" COPYONLY)
|
||||
configure_file(${TARGET_CONF_TEMP} ${TARGET_CONF_DIR}/${TARGET_CONF} COPYONLY)
|
||||
|
||||
ParseGetArchVars(${GETARCH2_MAKE_OUT})
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@ if (DEFINED TARGET)
|
|||
endif ()
|
||||
|
||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||
if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
|
||||
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||
endif ()
|
||||
|
||||
|
@ -160,16 +160,11 @@ else()
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (C_LAPACK)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wno-error=incompatible-pointer-types")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
|
@ -177,14 +172,15 @@ if (DEFINED TARGET)
|
|||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake -mllvm -exhaustive-register-search")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -mllvm -exhaustive-register-search")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
|
||||
else()
|
||||
|
@ -192,39 +188,19 @@ if (DEFINED TARGET)
|
|||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids -mllvm -exhaustive-register-search")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -mllvm -exhaustive-register-search")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mllvm -exhaustive-register-search")
|
||||
endif()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
|
||||
if (((${TARGET} STREQUAL ZEN) AND HAVE_AVX512VL) AND NOT NO_AVX512)
|
||||
if ((${TARGET} STREQUAL HASWELL OR ${TARGET} STREQUAL ZEN) AND NOT NO_AVX2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.99)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 15.99)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mllvm -exhaustive-register-search")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if ((${TARGET} STREQUAL HASWELL OR (${TARGET} STREQUAL ZEN AND NOT HAVE_AVX512VL)) AND NOT NO_AVX2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 4.7 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 4.7)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
|
@ -263,68 +239,26 @@ if (DEFINED TARGET)
|
|||
endif()
|
||||
|
||||
if (${TARGET} STREQUAL POWER10)
|
||||
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.")
|
||||
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.")
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL POWER9)
|
||||
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 5.0 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 5.0)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
message(WARNING "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support fully Power9.")
|
||||
message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.")
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL POWER8)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
endif()
|
||||
|
||||
if (${TARGET} STREQUAL NEOVERSEV1)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse V1.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL NEOVERSEN2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.4 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.4)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} ${CMAKE_C_COMPILER_VERSION} does not support Neoverse N2.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL ARMV8SVE)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve")
|
||||
else ()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL A64FX)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve -mtune=a64fx")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support A64FX.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
endif ()
|
||||
|
@ -388,25 +322,15 @@ if (NEED_PIC)
|
|||
endif()
|
||||
endif ()
|
||||
|
||||
if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64 OR LOONGARCH64)
|
||||
if (X86_64 OR ${CORE} STREQUAL POWER10)
|
||||
set(SMALL_MATRIX_OPT TRUE)
|
||||
endif ()
|
||||
if (ARM64)
|
||||
set(GEMM_GEMV_FORWARD TRUE)
|
||||
endif ()
|
||||
|
||||
if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD")
|
||||
endif ()
|
||||
if (GEMM_GEMV_FORWARD_BF16 AND NOT ONLY_CBLAS)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD_BF16")
|
||||
endif ()
|
||||
if (SMALL_MATRIX_OPT)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64)
|
||||
if (X86 OR X86_64 OR ARM64 OR POWER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
|
@ -521,11 +445,10 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH
|
|||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas")
|
||||
|
||||
if (DEFINED LIBNAMESUFFIX)
|
||||
set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}")
|
||||
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
|
||||
else ()
|
||||
set(LIBPREFIX "libopenblas")
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED SYMBOLPREFIX)
|
||||
|
@ -624,10 +547,7 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}")
|
|||
set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}")
|
||||
|
||||
#For LAPACK Fortran codes.
|
||||
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" )
|
||||
if (LAPACK_STRLEN)
|
||||
set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}")
|
||||
endif()
|
||||
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}")
|
||||
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")
|
||||
|
||||
#Disable -fopenmp for LAPACK Fortran codes on Windows.
|
||||
|
@ -639,19 +559,13 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
|||
endforeach ()
|
||||
endif ()
|
||||
|
||||
if (CMAKE_Fortran_COMPILER)
|
||||
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY")
|
||||
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
|
||||
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
message(STATUS "removing fortran flags")
|
||||
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
|
||||
endif ()
|
||||
foreach (FILTER_FLAG ${FILTER_FLAGS})
|
||||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
|
||||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
|
||||
endforeach ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
|
||||
# lapack-netlib is rife with uninitialized warnings -hpa
|
||||
|
@ -709,10 +623,6 @@ else ()
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED FIXED_LIBNAME)
|
||||
set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}")
|
||||
set (LIBNAME "${LIBPREFIX}_p.${LIBSUFFIX}")
|
||||
endif()
|
||||
|
||||
set(LIBDLLNAME "${LIBPREFIX}.dll")
|
||||
set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")
|
||||
|
|
|
@ -38,15 +38,13 @@ if(CMAKE_CL_64 OR MINGW64)
|
|||
endif()
|
||||
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
set(POWER 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
||||
set(LOONGARCH64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
|
||||
set(RISCV64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*"))
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
|
@ -62,7 +60,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTE
|
|||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(ARM64 1)
|
||||
else()
|
||||
|
@ -104,14 +102,12 @@ elseif(ARM)
|
|||
set(ARCH "arm")
|
||||
elseif(ARM64)
|
||||
set(ARCH "arm64")
|
||||
elseif(LOONGARCH64)
|
||||
set(ARCH "loongarch64")
|
||||
else()
|
||||
set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture")
|
||||
endif ()
|
||||
|
||||
if (NOT BINARY)
|
||||
if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
|
||||
set(BINARY 64)
|
||||
else ()
|
||||
set(BINARY 32)
|
||||
|
|
|
@ -87,15 +87,6 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
#message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
endif ()
|
||||
|
||||
# Example 1: SBGEMM_SMALL_M_PERMIT =
|
||||
# Unset the variable
|
||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
set(var_name ${CMAKE_MATCH_1})
|
||||
unset(${var_name})
|
||||
endif()
|
||||
|
||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
|
@ -187,8 +178,8 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
set (HasValidGroup 1)
|
||||
set (STR ${CMAKE_MATCH_4})
|
||||
endif ()
|
||||
if (DEFINED CMAKE_MATCH_1 AND ${HasValidGroup} EQUAL 1)
|
||||
if (NOT (CMAKE_MATCH_1 STREQUAL ${STR}))
|
||||
if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1)
|
||||
if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR}))
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
continue ()
|
||||
|
|
26
common.h
26
common.h
|
@ -358,6 +358,12 @@ typedef int blasint;
|
|||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
|
||||
#endif
|
||||
|
||||
#ifdef BULLDOZER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#ifndef YIELDING
|
||||
|
@ -365,13 +371,21 @@ typedef int blasint;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(ARCH_X86_64)
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
/*
|
||||
#ifdef STEAMROLLER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
#ifdef __EMSCRIPTEN__
|
||||
#define YIELDING
|
||||
|
@ -382,7 +396,7 @@ typedef int blasint;
|
|||
#endif
|
||||
|
||||
/***
|
||||
To alloc job_t on heap or stack.
|
||||
To alloc job_t on heap or statck.
|
||||
please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
***/
|
||||
#if defined(OS_WINDOWS)
|
||||
|
@ -468,10 +482,6 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_e2k.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_CSKY
|
||||
#include "common_csky.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
|
@ -515,7 +525,7 @@ static inline unsigned long long rpcc(void){
|
|||
#endif // !RPCC_DEFINED
|
||||
|
||||
#if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
do {
|
||||
while (*address) {YIELDING;};
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
#define WMB asm("wmb")
|
||||
#define RMB asm("mb")
|
||||
|
||||
static __inline void blas_lock(unsigned long *address){
|
||||
static void __inline blas_lock(unsigned long *address){
|
||||
#ifndef __DECC
|
||||
unsigned long tmp1, tmp2;
|
||||
asm volatile(
|
||||
|
|
|
@ -47,13 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
#if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
|
||||
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
int register ret;
|
||||
|
||||
|
|
|
@ -44,6 +44,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory")
|
||||
#endif
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
|
||||
#define RETURN_BY_STACK
|
||||
#else
|
||||
|
@ -53,22 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
static __inline int WhereAmI(void){
|
||||
uint64_t ret;
|
||||
__asm__ volatile (
|
||||
" mrs x0, mpidr_el1 \n"
|
||||
" and x0, x0, 0xff \n"
|
||||
:"=r" (ret)
|
||||
:: "memory"
|
||||
);
|
||||
ret +=1;
|
||||
if ((int)ret <0) ret = 0;
|
||||
return (int)ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
BLASULONG ret;
|
||||
|
||||
|
@ -175,11 +162,7 @@ REALNAME:
|
|||
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
#ifndef BUFFERSIZE
|
||||
#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
|
||||
#define BUFFER_SIZE (32 << 22)
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << 20)
|
||||
#endif
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||
#endif
|
||||
|
|
|
@ -1,56 +0,0 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_CSKY
|
||||
#define COMMON_CSKY
|
||||
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#endif
|
|
@ -41,6 +41,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#define RMB
|
||||
|
||||
#define INLINE __attribute__((__always_inline__)) inline
|
||||
|
||||
/* Returns x / y computed with ordinary C integer division; the result is
 * narrowed to the int return type. */
static inline int blas_quickdivide(blasint x, blasint y) {
  return (int)(x / y);
}
|
||||
|
|
|
@ -47,11 +47,6 @@ int BLASFUNC(xerbla)(char *, blasint *info, blasint);
|
|||
|
||||
void openblas_set_num_threads_(int *);
|
||||
|
||||
/*Set the threading backend to a custom callback.*/
|
||||
typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
|
||||
typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
|
||||
extern openblas_threads_callback openblas_threads_callback_;
|
||||
|
||||
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
|
||||
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);
|
||||
|
||||
|
@ -778,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
|
|||
|
||||
void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
||||
void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
|
||||
void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
|
||||
void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
||||
|
||||
void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
|
||||
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
|
||||
|
|
|
@ -1937,13 +1937,8 @@ int zimatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG);
|
|||
int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG);
|
||||
int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG);
|
||||
int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG);
|
||||
int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);
|
||||
int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);
|
||||
|
||||
int sgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
|
||||
int dgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
|
||||
int cgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
|
||||
int zgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
|
||||
int sbgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
|
||||
|
||||
#ifdef __CUDACC__
|
||||
}
|
||||
|
|
|
@ -75,51 +75,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/* Plain integer division of x by y. The quotient is computed in blasint
 * and converted to int on return. */
static inline int blas_quickdivide(blasint x, blasint y){
  const blasint result = x / y;

  return result;
}
|
||||
|
||||
#ifndef NO_AFFINITY
|
||||
/* Executes `rdtimel.w` and returns the value the instruction writes to its
 * second operand (bound to [id]). The first operand ([counter]) is read as
 * well but discarded.
 * NOTE(review): presumably the returned id is used as a CPU identifier by
 * the affinity code -- confirm against the callers. */
static inline int WhereAmI(void){
  int timer_id = 0;
  int timer_value = 0;

  __asm__ volatile (
    "rdtimel.w %[counter], %[id]"
    : [id]"=r"(timer_id), [counter]"=r"(timer_value)
    :
    : "memory"
  );

  return timer_id;
}
|
||||
#endif
|
||||
|
||||
/*
 * Look up the CPU model string in /proc/cpuinfo.
 *
 * Scans the file line by line for a line containing "model name" and copies
 * its second ':'-separated field (leading spaces and trailing CR/LF removed)
 * into model_name.
 *
 * model_name: caller-provided output buffer. The copied string is taken from
 *             a 1024-byte line buffer, so 1024 bytes is always sufficient.
 *
 * Returns 1 when a model name was copied, 0 when /proc/cpuinfo could not be
 * opened or no usable "model name" line was found.
 */
static inline int get_cpu_model(char *model_name) {
  char line[1024];
  int found = 0;
  FILE *cpuinfo_file = fopen("/proc/cpuinfo", "r");

  if (!cpuinfo_file) {
    return 0;
  }

  while (!found && fgets(line, sizeof(line), cpuinfo_file)) {
    char *token;
    size_t len;

    if (!strstr(line, "model name")) {
      continue;
    }

    /* First field is the key, second field is the value. */
    token = strtok(line, ":");
    if (token) {
      token = strtok(NULL, ":");
    }
    if (!token) {
      /* Malformed line without a value field: keep scanning instead of
       * dereferencing a NULL pointer. */
      continue;
    }

    while (*token == ' ') {
      token++;
    }

    /* Strip trailing line terminators, including the case where the value
     * consists of nothing but the terminator. */
    len = strlen(token);
    while (len > 0 && (token[len - 1] == '\n' || token[len - 1] == '\r')) {
      token[--len] = '\0';
    }

    strcpy(model_name, token);
    found = 1;
  }

  fclose(cpuinfo_file);
  return found;
}
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||
#else
|
||||
|
@ -143,50 +106,12 @@ static inline int get_cpu_model(char *model_name) {
|
|||
#define MOV fmov.d
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.d
|
||||
#define MTG movfr2gr.d
|
||||
#define FABS fabs.d
|
||||
#define FMIN fmin.d
|
||||
#define FMINA fmina.d
|
||||
#define FMAX fmax.d
|
||||
#define FMAXA fmaxa.d
|
||||
#define CMPEQ fcmp.ceq.d
|
||||
#define CMPLE fcmp.cle.d
|
||||
#define CMPLT fcmp.clt.d
|
||||
#define NEG fneg.d
|
||||
#define FFINT ffint.d.l
|
||||
|
||||
#define XVFSUB xvfsub.d
|
||||
#define XVFADD xvfadd.d
|
||||
#define XVFMUL xvfmul.d
|
||||
#define XVFMADD xvfmadd.d
|
||||
#define XVFMIN xvfmin.d
|
||||
#define XVFMINA xvfmina.d
|
||||
#define XVFMAX xvfmax.d
|
||||
#define XVFMAXA xvfmaxa.d
|
||||
#define XVCMPEQ xvfcmp.ceq.d
|
||||
#define XVCMPLE xvfcmp.cle.d
|
||||
#define XVCMPLT xvfcmp.clt.d
|
||||
#define XVMUL xvfmul.d
|
||||
#define XVMSUB xvfmsub.d
|
||||
#define XVNMSUB xvfnmsub.d
|
||||
|
||||
#define VFSUB vfsub.d
|
||||
#define VFADD vfadd.d
|
||||
#define VFMUL vfmul.d
|
||||
#define VFMADD vfmadd.d
|
||||
#define VFMIN vfmin.d
|
||||
#define VFMINA vfmina.d
|
||||
#define VFMAX vfmax.d
|
||||
#define VFMAXA vfmaxa.d
|
||||
#define VCMPEQ vfcmp.ceq.d
|
||||
#define VCMPLE vfcmp.cle.d
|
||||
#define VCMPLT vfcmp.clt.d
|
||||
#define VMUL vfmul.d
|
||||
#define VMSUB vfmsub.d
|
||||
#define VNMSUB vfnmsub.d
|
||||
|
||||
#else
|
||||
|
||||
#define LD fld.s
|
||||
#define ST fst.s
|
||||
#define MADD fmadd.s
|
||||
|
@ -199,48 +124,11 @@ static inline int get_cpu_model(char *model_name) {
|
|||
#define MOV fmov.s
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.w
|
||||
#define MTG movfr2gr.s
|
||||
#define FABS fabs.s
|
||||
#define FMIN fmin.s
|
||||
#define FMINA fmina.s
|
||||
#define FMAX fmax.s
|
||||
#define FMAXA fmaxa.s
|
||||
#define CMPEQ fcmp.ceq.s
|
||||
#define CMPLE fcmp.cle.s
|
||||
#define CMPLT fcmp.clt.s
|
||||
#define NEG fneg.s
|
||||
#define FFINT ffint.s.l
|
||||
|
||||
#define XVFSUB xvfsub.s
|
||||
#define XVFADD xvfadd.s
|
||||
#define XVFMUL xvfmul.s
|
||||
#define XVFMADD xvfmadd.s
|
||||
#define XVFMIN xvfmin.s
|
||||
#define XVFMINA xvfmina.s
|
||||
#define XVFMAX xvfmax.s
|
||||
#define XVFMAXA xvfmaxa.s
|
||||
#define XVCMPEQ xvfcmp.ceq.s
|
||||
#define XVCMPLE xvfcmp.cle.s
|
||||
#define XVCMPLT xvfcmp.clt.s
|
||||
#define XVMUL xvfmul.s
|
||||
#define XVMSUB xvfmsub.s
|
||||
#define XVNMSUB xvfnmsub.s
|
||||
|
||||
#define VFSUB vfsub.s
|
||||
#define VFADD vfadd.s
|
||||
#define VFMUL vfmul.s
|
||||
#define VFMADD vfmadd.s
|
||||
#define VFMIN vfmin.s
|
||||
#define VFMINA vfmina.s
|
||||
#define VFMAX vfmax.s
|
||||
#define VFMAXA vfmaxa.s
|
||||
#define VCMPEQ vfcmp.ceq.s
|
||||
#define VCMPLE vfcmp.cle.s
|
||||
#define VCMPLT vfcmp.clt.s
|
||||
#define VMUL vfmul.s
|
||||
#define VMSUB vfmsub.s
|
||||
#define VNMSUB vfnmsub.s
|
||||
|
||||
#endif /* defined(DOUBLE) */
|
||||
|
||||
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
|
@ -279,13 +167,9 @@ REALNAME: ;\
|
|||
#define GNUSTACK
|
||||
#endif /* defined(__linux__) && defined(__ELF__) */
|
||||
|
||||
#ifdef __clang__
|
||||
#define EPILOGUE .end
|
||||
#else
|
||||
#define EPILOGUE \
|
||||
.end REALNAME ;\
|
||||
GNUSTACK
|
||||
#endif
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
|
|
|
@ -2655,20 +2655,9 @@ typedef struct {
|
|||
BLASLONG prea, preb, prec, pred;
|
||||
#endif
|
||||
|
||||
|
||||
//for gemm_batch
|
||||
void * routine;
|
||||
int routine_mode;
|
||||
|
||||
} blas_arg_t;
|
||||
#endif
|
||||
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
#define BLAS_SMALL_OPT 0x10000U
|
||||
#define BLAS_SMALL_B0_OPT 0x30000U
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef XDOUBLE
|
||||
|
||||
#define TRSV_NUU qtrsv_NUU
|
||||
|
|
|
@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
|
|
@ -75,6 +75,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
static inline unsigned int rpcc(void){
|
||||
|
|
132
common_param.h
132
common_param.h
|
@ -1,6 +1,5 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -46,14 +45,12 @@
|
|||
|
||||
typedef struct {
|
||||
int dtb_entries;
|
||||
int switch_ratio;
|
||||
int offsetA, offsetB, align;
|
||||
|
||||
#if BUILD_BFLOAT16 == 1
|
||||
#ifdef BUILD_BFLOAT16
|
||||
int sbgemm_p, sbgemm_q, sbgemm_r;
|
||||
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
||||
int sbgemm_align_k;
|
||||
int need_amxtile_permission; // 0 default, 1 for device support amx.
|
||||
|
||||
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
|
@ -164,59 +161,51 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE == 1) || (BUILD_DOUBLE == 1) || (BUILD_COMPLEX == 1) || (BUILD_COMPLEX16 == 1)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
int sgemm_p, sgemm_q, sgemm_r;
|
||||
int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn;
|
||||
#endif
|
||||
|
||||
int exclusive_cache;
|
||||
|
||||
#if (BUILD_SINGLE == 1) || (BUILD_COMPLEX == 1)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
float (*samax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*samin_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*smax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*smin_k) (BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE ==1) || (BUILD_COMPLEX==1)
|
||||
BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
|
||||
BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1)
|
||||
#ifdef BUILD_SINGLE
|
||||
float (*ssum_k) (BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
|
||||
|
||||
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1)
|
||||
int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
|
||||
int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1)
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
|
||||
int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
#ifdef ARCH_X86_64
|
||||
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
|
||||
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
|
||||
|
@ -231,7 +220,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
|
||||
#ifdef BUILD_SINGLE
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
|
@ -267,8 +256,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1)
|
||||
|
||||
int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
@ -300,12 +288,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int dgemm_p, dgemm_q, dgemm_r;
|
||||
int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn;
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
double (*damax_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*damin_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dmax_k) (BLASLONG, double *, BLASLONG);
|
||||
|
@ -314,21 +302,23 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG);
|
|||
BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG);
|
||||
BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG);
|
||||
BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1)
|
||||
#ifdef BUILD_DOUBLE
|
||||
double (*dsum_k) (BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1)
|
||||
#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE)
|
||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
||||
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
|
@ -336,13 +326,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1)
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
|
||||
int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
|
||||
int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
|
||||
|
@ -351,7 +341,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
|
||||
#ifdef BUILD_DOUBLE
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
|
||||
|
||||
|
@ -365,8 +355,6 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
#endif
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1)
|
||||
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
|
@ -513,25 +501,23 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
#endif
|
||||
|
||||
#if (BUILD_COMPLEX==1)
|
||||
#ifdef BUILD_COMPLEX
|
||||
int cgemm_p, cgemm_q, cgemm_r;
|
||||
int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn;
|
||||
|
||||
|
||||
float (*camax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*camin_k) (BLASLONG, float *, BLASLONG);
|
||||
|
||||
BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
||||
|
||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*csum_k) (BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
|
||||
|
||||
int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -725,7 +711,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
#endif
|
||||
|
||||
#if (BUILD_COMPLEX16 == 1)
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int zgemm_p, zgemm_q, zgemm_r;
|
||||
int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn;
|
||||
|
||||
|
@ -1107,34 +1093,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
void (*init)(void);
|
||||
|
||||
int snum_opt, dnum_opt, qnum_opt;
|
||||
#if (BUILD_SINGLE==1)
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1)
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX==1)
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX16==1)
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE==1)
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE==1)
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_COMPLEX==1)
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
|
@ -1146,7 +1132,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_COMPLEX16==1)
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
|
@ -1158,21 +1144,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE==1)
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE==1)
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_COMPLEX==1)
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
|
@ -1184,7 +1170,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_COMPLEX16==1)
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
|
@ -1196,16 +1182,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE==1)
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE==1)
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX==1)
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX16==1)
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG);
|
||||
#endif
|
||||
} gotoblas_t;
|
||||
|
@ -1221,7 +1207,7 @@ extern gotoblas_t *gotoblas;
|
|||
|
||||
#define HAVE_EX_L2 gotoblas -> exclusive_cache
|
||||
|
||||
#if (BUILD_BFLOAT16==1)
|
||||
#ifdef BUILD_BFLOAT16
|
||||
#define SBGEMM_P gotoblas -> sbgemm_p
|
||||
#define SBGEMM_Q gotoblas -> sbgemm_q
|
||||
#define SBGEMM_R gotoblas -> sbgemm_r
|
||||
|
@ -1230,7 +1216,7 @@ extern gotoblas_t *gotoblas;
|
|||
#define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE==1)
|
||||
#if defined (BUILD_SINGLE)
|
||||
#define SGEMM_P gotoblas -> sgemm_p
|
||||
#define SGEMM_Q gotoblas -> sgemm_q
|
||||
#define SGEMM_R gotoblas -> sgemm_r
|
||||
|
@ -1239,21 +1225,13 @@ extern gotoblas_t *gotoblas;
|
|||
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE==1)
|
||||
#if defined (BUILD_DOUBLE)
|
||||
#define DGEMM_P gotoblas -> dgemm_p
|
||||
#define DGEMM_Q gotoblas -> dgemm_q
|
||||
#define DGEMM_R gotoblas -> dgemm_r
|
||||
#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m
|
||||
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
|
||||
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
|
||||
#if (BUILD_SINGLE != 1)
|
||||
#define SGEMM_P gotoblas -> sgemm_p
|
||||
#define SGEMM_Q gotoblas -> sgemm_q
|
||||
#define SGEMM_R 1024
|
||||
#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m
|
||||
#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n
|
||||
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define QGEMM_P gotoblas -> qgemm_p
|
||||
|
@ -1263,14 +1241,14 @@ extern gotoblas_t *gotoblas;
|
|||
#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n
|
||||
#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn
|
||||
|
||||
#if (BUILD_COMPLEX==1)
|
||||
#ifdef BUILD_COMPLEX
|
||||
#define CGEMM_P gotoblas -> cgemm_p
|
||||
#define CGEMM_Q gotoblas -> cgemm_q
|
||||
#define CGEMM_R gotoblas -> cgemm_r
|
||||
#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m
|
||||
#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n
|
||||
#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn
|
||||
#if (BUILD_SINGLE != 1)
|
||||
#ifndef BUILD_SINGLE
|
||||
#define SGEMM_P gotoblas -> sgemm_p
|
||||
#define SGEMM_Q gotoblas -> sgemm_q
|
||||
#define SGEMM_R 1024
|
||||
|
@ -1280,14 +1258,14 @@ extern gotoblas_t *gotoblas;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if (BUILD_COMPLEX16==1)
|
||||
#ifdef BUILD_COMPLEX16
|
||||
#define ZGEMM_P gotoblas -> zgemm_p
|
||||
#define ZGEMM_Q gotoblas -> zgemm_q
|
||||
#define ZGEMM_R gotoblas -> zgemm_r
|
||||
#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m
|
||||
#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n
|
||||
#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn
|
||||
#if (BUILD_DOUBLE != 1)
|
||||
#ifndef BUILD_DOUBLE
|
||||
#define DGEMM_P gotoblas -> dgemm_p
|
||||
#define DGEMM_Q gotoblas -> dgemm_q
|
||||
#define DGEMM_R 1024
|
||||
|
@ -1295,14 +1273,6 @@ extern gotoblas_t *gotoblas;
|
|||
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
|
||||
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
|
||||
#endif
|
||||
#if (BUILD_COMPLEX != 1)
|
||||
#define CGEMM_P gotoblas -> cgemm_p
|
||||
#define CGEMM_Q gotoblas -> cgemm_q
|
||||
#define CGEMM_R gotoblas -> cgemm_r
|
||||
#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m
|
||||
#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n
|
||||
#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define XGEMM_P gotoblas -> xgemm_p
|
||||
|
@ -1349,7 +1319,7 @@ extern gotoblas_t *gotoblas;
|
|||
#define HAVE_EX_L2 0
|
||||
#endif
|
||||
|
||||
#if (BUILD_BFLOAT16 == 1)
|
||||
#ifdef BUILD_BFLOAT16
|
||||
#define SBGEMM_P SBGEMM_DEFAULT_P
|
||||
#define SBGEMM_Q SBGEMM_DEFAULT_Q
|
||||
#define SBGEMM_R SBGEMM_DEFAULT_R
|
||||
|
|
|
@ -78,6 +78,8 @@
|
|||
#define RMB __asm__ __volatile__ ("sync")
|
||||
#endif
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifdef PPC440
|
||||
#define STDERR stdout
|
||||
#define QNONCACHE 0x1
|
||||
|
@ -89,7 +91,7 @@
|
|||
|
||||
void *qalloc(int flags, size_t bytes);
|
||||
|
||||
static inline void blas_lock(volatile unsigned long *address){
|
||||
static void INLINE blas_lock(volatile unsigned long *address){
|
||||
|
||||
long int ret, val = 1;
|
||||
|
||||
|
@ -839,17 +841,17 @@ Lmcount$lazy_ptr:
|
|||
#endif
|
||||
|
||||
#if defined(PPC440)
|
||||
#define BUFFER_SIZE ( 2UL << 20)
|
||||
#define BUFFER_SIZE ( 2 << 20)
|
||||
#elif defined(PPC440FP2)
|
||||
#define BUFFER_SIZE ( 16UL << 20)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#elif defined(POWER6) || defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#define BUFFER_SIZE ( 64UL << 22)
|
||||
#define BUFFER_SIZE ( 64 << 22)
|
||||
#else
|
||||
#define BUFFER_SIZE ( 16UL << 20)
|
||||
#define BUFFER_SIZE ( 16 << 20)
|
||||
#endif
|
||||
#ifdef DYNAMIC_ARCH
|
||||
#undef BUFFER_SIZE
|
||||
#define BUFFER_SIZE (64UL << 22)
|
||||
#define BUFFER_SIZE (64 << 22)
|
||||
#endif
|
||||
|
||||
#ifndef PAGESIZE
|
||||
|
|
|
@ -75,6 +75,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
|
@ -89,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
|
||||
#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
|
||||
# include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
|
|
|
@ -45,7 +45,7 @@
|
|||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
static __inline void blas_lock(volatile unsigned long *address){
|
||||
static void __inline blas_lock(volatile unsigned long *address){
|
||||
|
||||
long int ret = 1;
|
||||
|
||||
|
|
|
@ -111,9 +111,8 @@ typedef struct blas_queue {
|
|||
struct blas_queue *next;
|
||||
|
||||
#if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
|
||||
CRITICAL_SECTION lock;
|
||||
HANDLE finish;
|
||||
volatile int finished;
|
||||
CRITICAL_SECTION lock;
|
||||
HANDLE finish;
|
||||
#else
|
||||
pthread_mutex_t lock;
|
||||
pthread_cond_t finished;
|
||||
|
@ -136,32 +135,23 @@ typedef struct blas_queue {
|
|||
#ifdef SMP_SERVER
|
||||
|
||||
extern int blas_server_avail;
|
||||
extern int blas_omp_number_max;
|
||||
extern int blas_omp_threads_local;
|
||||
|
||||
static __inline int num_cpu_avail(int level) {
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
int openmp_nthreads;
|
||||
openmp_nthreads=omp_get_max_threads();
|
||||
if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
|
||||
int openmp_nthreads=omp_get_max_threads();
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
if (blas_cpu_number == 1
|
||||
#else
|
||||
if (openmp_nthreads == 1
|
||||
#endif
|
||||
#ifdef USE_OPENMP
|
||||
if (openmp_nthreads == 1 || omp_in_parallel()
|
||||
#endif
|
||||
) return 1;
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
if (openmp_nthreads > blas_omp_number_max){
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
|
||||
#endif
|
||||
openmp_nthreads = blas_omp_number_max;
|
||||
}
|
||||
if (blas_cpu_number != openmp_nthreads) {
|
||||
if (blas_cpu_number != openmp_nthreads) {
|
||||
goto_set_num_threads(openmp_nthreads);
|
||||
}
|
||||
#endif
|
||||
|
@ -194,27 +184,27 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer);
|
|||
int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
|
||||
void *a, BLASLONG lda,
|
||||
void *b, BLASLONG ldb,
|
||||
void *c, BLASLONG ldc, int (*function)(void), int threads);
|
||||
void *c, BLASLONG ldc, int (*function)(), int threads);
|
||||
|
||||
int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *, void *, BLASLONG);
|
||||
int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG), void *, void *, BLASLONG);
|
||||
int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG);
|
||||
int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG, BLASLONG);
|
||||
int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG);
|
||||
|
||||
int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
double alpha_r, double alpha_i,
|
||||
void *a, BLASLONG lda,
|
||||
void *c, BLASLONG ldc, int (*function)(void), void *buffer);
|
||||
void *c, BLASLONG ldc, int (*function)(), void *buffer);
|
||||
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void*, void*, BLASLONG);
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||
void *offsetA, BLASLONG lda,
|
||||
void *offsetB, BLASLONG jb,
|
||||
void *ipiv, BLASLONG offset, int (*function)(void), void *buffer);
|
||||
void *ipiv, BLASLONG offset, int (*function)(), void *buffer);
|
||||
|
||||
#endif /* ENDIF ASSEMBLER */
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@
|
|||
#define __volatile__
|
||||
#endif
|
||||
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
int ret;
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@
|
|||
#define RMB
|
||||
#endif
|
||||
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
|
||||
#ifndef C_MSVC
|
||||
|
@ -253,7 +253,7 @@ static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){
|
|||
#ifndef BUFFERSIZE
|
||||
#define BUFFER_SIZE (32 << 22)
|
||||
#else
|
||||
#define BUFFER_SIZE (32UL << BUFFERSIZE)
|
||||
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||
#endif
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
|
|
@ -37,12 +37,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define WMB
|
||||
#define RMB
|
||||
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/*
|
||||
static __inline void blas_lock(volatile BLASULONG *address){
|
||||
static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
BLASULONG ret;
|
||||
|
||||
|
|
|
@ -42,11 +42,9 @@ size_t length64=sizeof(value64);
|
|||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
#define CPU_CORTEXA76 23
|
||||
#define CPU_NEOVERSEN1 11
|
||||
#define CPU_NEOVERSEV1 16
|
||||
#define CPU_NEOVERSEN2 17
|
||||
#define CPU_NEOVERSEV2 24
|
||||
#define CPU_CORTEXX1 18
|
||||
#define CPU_CORTEXX2 19
|
||||
#define CPU_CORTEXA510 20
|
||||
|
@ -91,9 +89,7 @@ static char *cpuname[] = {
|
|||
"CORTEXX2",
|
||||
"CORTEXA510",
|
||||
"CORTEXA710",
|
||||
"FT2000",
|
||||
"CORTEXA76",
|
||||
"NEOVERSEV2"
|
||||
"FT2000"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
@ -119,9 +115,7 @@ static char *cpuname_lower[] = {
|
|||
"cortexx2",
|
||||
"cortexa510",
|
||||
"cortexa710",
|
||||
"ft2000",
|
||||
"cortexa76",
|
||||
"neoversev2"
|
||||
"ft2000"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
@ -208,18 +202,10 @@ int detect(void)
|
|||
return CPU_CORTEXA510;
|
||||
else if (strstr(cpu_part, "0xd47"))
|
||||
return CPU_CORTEXA710;
|
||||
else if (strstr(cpu_part, "0xd4d")) //A715
|
||||
return CPU_CORTEXA710;
|
||||
else if (strstr(cpu_part, "0xd44"))
|
||||
return CPU_CORTEXX1;
|
||||
else if (strstr(cpu_part, "0xd4c"))
|
||||
return CPU_CORTEXX2;
|
||||
else if (strstr(cpu_part, "0xd4e")) //X3
|
||||
return CPU_CORTEXX2;
|
||||
else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al.
|
||||
return CPU_NEOVERSEV2;
|
||||
else if (strstr(cpu_part, "0xd0b"))
|
||||
return CPU_CORTEXA76;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
|
@ -277,10 +263,8 @@ int detect(void)
|
|||
}
|
||||
#else
|
||||
#ifdef __APPLE__
|
||||
sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
|
||||
if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
|
||||
if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
|
||||
if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
|
||||
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
|
||||
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
|
||||
#endif
|
||||
return CPU_ARMV8;
|
||||
#endif
|
||||
|
@ -401,8 +385,6 @@ void get_cpuconfig(void)
|
|||
break;
|
||||
|
||||
case CPU_NEOVERSEV1:
|
||||
printf("#define HAVE_SVE 1\n");
|
||||
case CPU_CORTEXA76:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
|
@ -430,32 +412,12 @@ void get_cpuconfig(void)
|
|||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define HAVE_SVE 1\n");
|
||||
break;
|
||||
case CPU_NEOVERSEV2:
|
||||
printf("#define ARMV9\n");
|
||||
printf("#define HAVE_SVE 1\n");
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
// L1 Data TLB = 48 entries
|
||||
// L2 Data TLB = 2048 entries
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs.
|
||||
break;
|
||||
case CPU_CORTEXA510:
|
||||
case CPU_CORTEXA710:
|
||||
case CPU_CORTEXX1:
|
||||
case CPU_CORTEXX2:
|
||||
printf("#define ARMV9\n");
|
||||
printf("#define HAVE_SVE 1\n");
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
|
@ -572,7 +534,6 @@ void get_cpuconfig(void)
|
|||
break;
|
||||
case CPU_A64FX:
|
||||
printf("#define A64FX\n");
|
||||
printf("#define HAVE_SVE 1\n");
|
||||
printf("#define L1_CODE_SIZE 65535\n");
|
||||
printf("#define L1_DATA_SIZE 65535\n");
|
||||
printf("#define L1_DATA_LINESIZE 256\n");
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2024, The OpenBLAS Project
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -32,299 +32,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
**********************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
#define CPU_LA64_GENERIC 0
|
||||
#define CPU_LA264 1
|
||||
#define CPU_LA364 2
|
||||
#define CPU_LA464 3
|
||||
#define CPU_LA664 4
|
||||
/* If LASX extension instructions supported,
|
||||
* using core LOONGSON3R5
|
||||
* If only LSX extension instructions supported,
|
||||
* using core LOONGSON2K1000
|
||||
* If neither LASX nor LSX extension instructions supported,
|
||||
* using core LOONGSONGENERIC (As far as I know, there is no such
|
||||
* CPU yet)
|
||||
*/
|
||||
|
||||
#define CORE_LA64_GENERIC 0
|
||||
#define CORE_LA264 1
|
||||
#define CORE_LA464 2
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_LOONGSON3R5 1
|
||||
#define CPU_LOONGSON2K1000 2
|
||||
|
||||
#define LA_HWCAP_LSX (1U << 4)
|
||||
#define LA_HWCAP_LASX (1U << 5)
|
||||
#define LOONGARCH_CFG2 0x02
|
||||
#define LOONGARCH_LASX 1<<7
|
||||
#define LOONGARCH_LSX 1<<6
|
||||
|
||||
#define LOONGARCH_CFG0 0x00
|
||||
#define LOONGARCH_CFG2 0x02
|
||||
#define LOONGARCH_CFG10 0x10
|
||||
#define LOONGARCH_CFG11 0x11
|
||||
#define LOONGARCH_CFG12 0x12
|
||||
#define LOONGARCH_CFG13 0x13
|
||||
#define LOONGARCH_CFG14 0x14
|
||||
#define LASX_MASK 1<<7
|
||||
#define LSX_MASK 1<<6
|
||||
#define PRID_SERIES_MASK 0xf000
|
||||
#define PRID_SERIES_LA264 0xa000
|
||||
#define PRID_SERIES_LA364 0xb000
|
||||
#define PRID_SERIES_LA464 0xc000
|
||||
#define PRID_SERIES_LA664 0xd000
|
||||
|
||||
#define CACHE_INFO_L1_IU 0
|
||||
#define CACHE_INFO_L1_D 1
|
||||
#define CACHE_INFO_L2_IU 2
|
||||
#define CACHE_INFO_L2_D 3
|
||||
#define CACHE_INFO_L3_IU 4
|
||||
#define CACHE_INFO_L3_D 5
|
||||
#define L1_IU_PRESENT_MASK 0x0001
|
||||
#define L1_IU_UNITY_MASK 0x0002
|
||||
#define L1_D_PRESENT_MASK 0x0004
|
||||
#define L2_IU_PRESENT_MASK 0x0008
|
||||
#define L2_IU_UNITY_MASK 0x0010
|
||||
#define L2_D_PRESENT_MASK 0x0080
|
||||
#define L3_IU_PRESENT_MASK 0x0400
|
||||
#define L3_IU_UNITY_MASK 0x0800
|
||||
#define L3_D_PRESENT_MASK 0x4000
|
||||
#define CACHE_WAY_MINUS_1_MASK 0x0000ffff
|
||||
#define CACHE_INDEX_LOG2_MASK 0x00ff0000
|
||||
#define CACHE_LINESIZE_LOG2_MASK 0x7f000000
|
||||
|
||||
typedef struct {
|
||||
int size;
|
||||
int associative;
|
||||
int linesize;
|
||||
int unify;
|
||||
int present;
|
||||
} cache_info_t;
|
||||
|
||||
/* Using microarchitecture representation */
|
||||
static char *cpuname[] = {
|
||||
"LA64_GENERIC",
|
||||
"LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */
|
||||
"LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */
|
||||
"LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */
|
||||
"LA664" /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */
|
||||
"LOONGSONGENERIC",
|
||||
"LOONGSON3R5",
|
||||
"LOONGSON2K1000"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"la64_generic",
|
||||
"la264",
|
||||
"la364",
|
||||
"la464",
|
||||
"la664"
|
||||
"loongsongeneric",
|
||||
"loongson3r5",
|
||||
"loongson2k1000"
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
"LA64_GENERIC", /* Implies using scalar instructions for optimization */
|
||||
"LA264", /* Implies using LSX instructions for optimization */
|
||||
"LA464", /* Implies using LASX instructions for optimization */
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
"la64_generic",
|
||||
"la264",
|
||||
"la464",
|
||||
};
|
||||
|
||||
/*
|
||||
* Obtain cache and processor identification
|
||||
* through the cpucfg command.
|
||||
*/
|
||||
static void get_cacheinfo(int type, cache_info_t *cacheinfo) {
|
||||
cache_info_t cache_info;
|
||||
memset(&cache_info, 0, sizeof(cache_info));
|
||||
uint32_t reg_10 = 0;
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg_10)
|
||||
: "r"(LOONGARCH_CFG10)
|
||||
);
|
||||
|
||||
switch (type) {
|
||||
case CACHE_INFO_L1_IU:
|
||||
if (reg_10 & L1_IU_PRESENT_MASK) {
|
||||
uint32_t reg_11 = 0;
|
||||
cache_info.present = reg_10 & L1_IU_PRESENT_MASK;
|
||||
cache_info.unify = reg_10 & L1_IU_UNITY_MASK;
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg_11)
|
||||
: "r"(LOONGARCH_CFG11)
|
||||
);
|
||||
cache_info.associative = (reg_11 & CACHE_WAY_MINUS_1_MASK) + 1;
|
||||
cache_info.linesize = 1 << ((reg_11 & CACHE_LINESIZE_LOG2_MASK) >> 24);
|
||||
cache_info.size = cache_info.associative * cache_info.linesize *
|
||||
(1 << ((reg_11 & CACHE_INDEX_LOG2_MASK) >> 16));
|
||||
}
|
||||
break;
|
||||
|
||||
case CACHE_INFO_L1_D:
|
||||
if (reg_10 & L1_D_PRESENT_MASK) {
|
||||
uint32_t reg_12 = 0;
|
||||
cache_info.present = reg_10 & L1_D_PRESENT_MASK;
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg_12)
|
||||
: "r"(LOONGARCH_CFG12)
|
||||
);
|
||||
cache_info.associative = (reg_12 & CACHE_WAY_MINUS_1_MASK) + 1;
|
||||
cache_info.linesize = 1 << ((reg_12 & CACHE_LINESIZE_LOG2_MASK) >> 24);
|
||||
cache_info.size = cache_info.associative * cache_info.linesize *
|
||||
(1 << ((reg_12 & CACHE_INDEX_LOG2_MASK) >> 16));
|
||||
}
|
||||
break;
|
||||
|
||||
case CACHE_INFO_L2_IU:
|
||||
if (reg_10 & L2_IU_PRESENT_MASK) {
|
||||
uint32_t reg_13 = 0;
|
||||
cache_info.present = reg_10 & L2_IU_PRESENT_MASK;
|
||||
cache_info.unify = reg_10 & L2_IU_UNITY_MASK;
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg_13)
|
||||
: "r"(LOONGARCH_CFG13)
|
||||
);
|
||||
cache_info.associative = (reg_13 & CACHE_WAY_MINUS_1_MASK) + 1;
|
||||
cache_info.linesize = 1 << ((reg_13 & CACHE_LINESIZE_LOG2_MASK) >> 24);
|
||||
cache_info.size = cache_info.associative * cache_info.linesize *
|
||||
(1 << ((reg_13 & CACHE_INDEX_LOG2_MASK) >> 16));
|
||||
}
|
||||
break;
|
||||
|
||||
case CACHE_INFO_L2_D:
|
||||
if (reg_10 & L2_D_PRESENT_MASK) {
|
||||
cache_info.present = reg_10 & L2_D_PRESENT_MASK;
|
||||
// No date fetch
|
||||
}
|
||||
break;
|
||||
|
||||
case CACHE_INFO_L3_IU:
|
||||
if (reg_10 & L3_IU_PRESENT_MASK) {
|
||||
uint32_t reg_14 = 0;
|
||||
cache_info.present = reg_10 & L3_IU_PRESENT_MASK;
|
||||
cache_info.unify = reg_10 & L3_IU_UNITY_MASK;
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg_14)
|
||||
: "r"(LOONGARCH_CFG14)
|
||||
);
|
||||
cache_info.associative = (reg_14 & CACHE_WAY_MINUS_1_MASK) + 1;
|
||||
cache_info.linesize = 1 << ((reg_14 & CACHE_LINESIZE_LOG2_MASK) >> 24);
|
||||
cache_info.size = cache_info.associative * cache_info.linesize *
|
||||
(1 << ((reg_14 & CACHE_INDEX_LOG2_MASK) >> 16));
|
||||
}
|
||||
break;
|
||||
|
||||
case CACHE_INFO_L3_D:
|
||||
if (reg_10 & L3_D_PRESENT_MASK) {
|
||||
cache_info.present = reg_10 & L3_D_PRESENT_MASK;
|
||||
// No data fetch
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
*cacheinfo = cache_info;
|
||||
}
|
||||
|
||||
static uint32_t get_prid() {
|
||||
int detect(void) {
|
||||
#ifdef __linux
|
||||
uint32_t reg = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG0)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
return reg;
|
||||
}
|
||||
|
||||
static void get_cpucount(uint32_t *count) {
|
||||
uint32_t num = 0;
|
||||
FILE *f = fopen("/proc/cpuinfo", "r");
|
||||
if (!f) return;
|
||||
char buf[200];
|
||||
while (fgets(buf, sizeof(buf), f))
|
||||
{
|
||||
if (!strncmp("processor", buf, 9))
|
||||
num ++;
|
||||
}
|
||||
fclose(f);
|
||||
*count = num;
|
||||
}
|
||||
|
||||
/* Detect whether the OS supports the LASX instruction set */
|
||||
static int os_support_lasx() {
|
||||
int hwcap = (int)getauxval(AT_HWCAP);
|
||||
|
||||
if (hwcap & LA_HWCAP_LASX)
|
||||
return 1;
|
||||
if (reg & LOONGARCH_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else if (reg & LOONGARCH_LSX)
|
||||
return CPU_LOONGSON2K1000;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Detect whether the OS supports the LSX instruction set */
|
||||
static int os_support_lsx() {
|
||||
int hwcap = (int)getauxval(AT_HWCAP);
|
||||
|
||||
if (hwcap & LA_HWCAP_LSX)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_coretype(void) {
|
||||
uint32_t prid = get_prid();
|
||||
switch (prid & PRID_SERIES_MASK) {
|
||||
case (PRID_SERIES_LA464):
|
||||
case (PRID_SERIES_LA664):
|
||||
if (os_support_lasx())
|
||||
return CORE_LA464;
|
||||
else if (os_support_lsx())
|
||||
return CORE_LA264;
|
||||
else
|
||||
return CORE_LA64_GENERIC;
|
||||
break;
|
||||
|
||||
case (PRID_SERIES_LA264):
|
||||
case (PRID_SERIES_LA364):
|
||||
if (os_support_lsx())
|
||||
return CORE_LA264;
|
||||
else
|
||||
return CORE_LA64_GENERIC;
|
||||
break;
|
||||
|
||||
default:
|
||||
return CORE_LA64_GENERIC;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
int get_cputype(void) {
|
||||
uint32_t prid = get_prid();
|
||||
switch (prid & PRID_SERIES_MASK) {
|
||||
case (PRID_SERIES_LA264):
|
||||
return CPU_LA264;
|
||||
break;
|
||||
|
||||
case (PRID_SERIES_LA364):
|
||||
return CPU_LA364;
|
||||
break;
|
||||
|
||||
case (PRID_SERIES_LA464):
|
||||
return CPU_LA464;
|
||||
break;
|
||||
|
||||
case (PRID_SERIES_LA664):
|
||||
return CPU_LA664;
|
||||
break;
|
||||
|
||||
default:
|
||||
return CPU_LA64_GENERIC;
|
||||
break;
|
||||
}
|
||||
return CPU_GENERIC;
|
||||
#endif
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
char *get_corename(void) {
|
||||
return corename[get_coretype()];
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
printf("%s", corename_lower[get_coretype()]);
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
void get_architecture(void) {
|
||||
|
@ -332,7 +91,8 @@ void get_architecture(void) {
|
|||
}
|
||||
|
||||
void get_subarchitecture(void) {
|
||||
printf("%s", cpuname[get_cputype()]);
|
||||
int d = detect();
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
void get_subdirname(void) {
|
||||
|
@ -340,69 +100,44 @@ void get_subdirname(void) {
|
|||
}
|
||||
|
||||
void get_cpuconfig(void) {
|
||||
cache_info_t info;
|
||||
uint32_t num_cores = 0;
|
||||
int d = detect();
|
||||
switch (d) {
|
||||
case CPU_LOONGSON3R5:
|
||||
printf("#define LOONGSON3R5\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
printf("#define %s\n", corename[get_coretype()]); // Core name
|
||||
case CPU_LOONGSON2K1000:
|
||||
printf("#define LOONGSON2K1000\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name
|
||||
|
||||
get_cacheinfo(CACHE_INFO_L1_IU, &info);
|
||||
if (info.present) {
|
||||
if (info.unify) { // Unified cache, without distinguishing between instructions and data
|
||||
printf("#define L1_SIZE %d\n", info.size);
|
||||
printf("#define L1_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define L1_LINESIZE %d\n", info.linesize);
|
||||
} else {
|
||||
printf("#define L1_CODE_SIZE %d\n", info.size);
|
||||
printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define L1_CODE_LINESIZE %d\n", info.linesize);
|
||||
}
|
||||
default:
|
||||
printf("#define LOONGSONGENERIC\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (!info.unify) {
|
||||
get_cacheinfo(CACHE_INFO_L1_D, &info);
|
||||
if (info.present) {
|
||||
printf("#define L1_DATA_SIZE %d\n", info.size);
|
||||
printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define L1_DATA_LINESIZE %d\n", info.linesize);
|
||||
}
|
||||
}
|
||||
|
||||
get_cacheinfo(CACHE_INFO_L2_IU, &info);
|
||||
if (info.present > 0) {
|
||||
if (info.unify) {
|
||||
printf("#define L2_SIZE %d\n", info.size);
|
||||
printf("#define L2_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define L2_LINESIZE %d\n", info.linesize);
|
||||
} else {
|
||||
printf("#define L2_CODE_SIZE %d\n", info.size);
|
||||
printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define L2_CODE_LINESIZE %d\n", info.linesize);
|
||||
}
|
||||
}
|
||||
|
||||
get_cacheinfo(CACHE_INFO_L3_IU, &info);
|
||||
if (info.present > 0) {
|
||||
if (info.unify) {
|
||||
printf("#define L3_SIZE %d\n", info.size);
|
||||
printf("#define L3_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define L3_LINESIZE %d\n", info.linesize);
|
||||
} else {
|
||||
printf("#define L3_CODE_SIZE %d\n", info.size);
|
||||
printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative);
|
||||
printf("#define L3_CODE_LINESIZE %d\n", info.linesize);
|
||||
}
|
||||
}
|
||||
|
||||
if(os_support_lsx) printf("#define HAVE_LSX\n");
|
||||
if(os_support_lasx) printf("#define HAVE_LASX\n");
|
||||
|
||||
get_cpucount(&num_cores);
|
||||
if (num_cores)
|
||||
printf("#define NUM_CORES %d\n", num_cores);
|
||||
|
||||
//TODO: It’s unclear what this entry represents, but it is indeed necessary.
|
||||
//It has been set based on reference to other platforms.
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
int d = detect();
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
|
|
@ -165,9 +165,7 @@ void get_cpuconfig(void){
|
|||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
#ifndef NO_MSA
|
||||
if (get_feature("msa")) printf("#define HAVE_MSA\n");
|
||||
#endif
|
||||
if (!get_feature("msa")) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
|
|
|
@ -208,9 +208,7 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}
|
||||
#ifndef NO_MSA
|
||||
if (get_feature("msa")) printf("#define HAVE_MSA\n");
|
||||
#endif
|
||||
if (!get_feature("msa")) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
|
|
|
@ -160,7 +160,6 @@ int detect(void){
|
|||
infoCount = HOST_BASIC_INFO_COUNT;
|
||||
host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);
|
||||
|
||||
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
|
||||
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
|
||||
if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970) return CPUTYPE_PPC970;
|
||||
|
||||
|
|
|
@ -84,14 +84,6 @@ static char *cpuname[] = {
|
|||
"CPU_RISCV64_ZVL128B"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"riscv64_generic",
|
||||
"c910v",
|
||||
"x280",
|
||||
"riscv64_zvl256b",
|
||||
"riscv64_zvl128b"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
#ifdef __linux
|
||||
FILE *infile;
|
||||
|
@ -100,29 +92,23 @@ int detect(void){
|
|||
char *pmodel = NULL, *pisa = NULL;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
if (!infile)
|
||||
return CPU_GENERIC;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if(!strncmp(buffer, "model name", 10)){
|
||||
strcpy(model_buffer, buffer);
|
||||
pmodel = strchr(model_buffer, ':');
|
||||
if (pmodel)
|
||||
pmodel++;
|
||||
pmodel = strchr(isa_buffer, ':') + 1;
|
||||
}
|
||||
|
||||
if(!strncmp(buffer, "isa", 3)){
|
||||
strcpy(isa_buffer, buffer);
|
||||
pisa = strchr(isa_buffer, '4');
|
||||
if (pisa)
|
||||
pisa++;
|
||||
pisa = strchr(isa_buffer, '4') + 1;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (!pmodel || !pisa)
|
||||
if (!pmodel)
|
||||
return(CPU_GENERIC);
|
||||
|
||||
|
||||
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
|
||||
return CPU_C910V;
|
||||
|
||||
|
@ -160,5 +146,5 @@ void get_cpuconfig(void){
|
|||
}
|
||||
|
||||
void get_libname(void){
|
||||
printf("%s", cpuname_lower[detect()]);
|
||||
printf("riscv64\n");
|
||||
}
|
||||
|
|
109
cpuid_x86.c
109
cpuid_x86.c
|
@ -194,7 +194,7 @@ static C_INLINE void xgetbv(int op, int * eax, int * edx){
|
|||
}
|
||||
#endif
|
||||
|
||||
int support_avx(void){
|
||||
int support_avx(){
|
||||
#ifndef NO_AVX
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
@ -212,7 +212,7 @@ int support_avx(void){
|
|||
#endif
|
||||
}
|
||||
|
||||
int support_avx2(void){
|
||||
int support_avx2(){
|
||||
#ifndef NO_AVX2
|
||||
int eax, ebx, ecx=0, edx;
|
||||
int ret=0;
|
||||
|
@ -228,7 +228,7 @@ int support_avx2(void){
|
|||
#endif
|
||||
}
|
||||
|
||||
int support_avx512(void){
|
||||
int support_avx512(){
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
@ -250,7 +250,7 @@ int support_avx512(void){
|
|||
#endif
|
||||
}
|
||||
|
||||
int support_avx512_bf16(void){
|
||||
int support_avx512_bf16(){
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
@ -271,7 +271,7 @@ int support_avx512_bf16(void){
|
|||
#define BIT_AMX_BF16 0x00400000
|
||||
#define BIT_AMX_ENBD 0x00060000
|
||||
|
||||
int support_amx_bf16(void) {
|
||||
int support_amx_bf16() {
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
@ -1479,8 +1479,6 @@ int get_cpuname(void){
|
|||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 15: // Sapphire Rapids
|
||||
if(support_amx_bf16())
|
||||
return CPUTYPE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
|
@ -1527,29 +1525,14 @@ int get_cpuname(void){
|
|||
break;
|
||||
case 10: //family 6 exmodel 10
|
||||
switch (model) {
|
||||
case 13: // Granite Rapids
|
||||
if(support_amx_bf16())
|
||||
return CPUTYPE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
case 10: // Meteor Lake
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 0: // Meteor Lake
|
||||
case 7: // Rocket Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
|
@ -1564,9 +1547,6 @@ int get_cpuname(void){
|
|||
case 11: //family 6 exmodel 11
|
||||
switch (model) {
|
||||
case 7: // Raptor Lake
|
||||
case 10:
|
||||
case 15:
|
||||
case 14: // Alder Lake N
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
|
@ -1575,19 +1555,6 @@ int get_cpuname(void){
|
|||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 12: //family 6 exmodel 12
|
||||
switch (model) {
|
||||
case 15:
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SAPPHIRERAPIDS;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 0x7:
|
||||
|
@ -1688,14 +1655,7 @@ int get_cpuname(void){
|
|||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
case 10: // Zen3/4
|
||||
case 11: // Zen5
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#endif
|
||||
case 10: // Zen3
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_ZEN;
|
||||
|
@ -1883,8 +1843,7 @@ static char *cpuname[] = {
|
|||
"ZEN",
|
||||
"SKYLAKEX",
|
||||
"DHYANA",
|
||||
"COOPERLAKE",
|
||||
"SAPPHIRERAPIDS",
|
||||
"COOPERLAKE"
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
|
@ -1941,8 +1900,7 @@ static char *lowercpuname[] = {
|
|||
"zen",
|
||||
"skylakex",
|
||||
"dhyana",
|
||||
"cooperlake",
|
||||
"sapphirerapids",
|
||||
"cooperlake"
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
|
@ -1976,8 +1934,7 @@ static char *corename[] = {
|
|||
"ZEN",
|
||||
"SKYLAKEX",
|
||||
"DHYANA",
|
||||
"COOPERLAKE",
|
||||
"SAPPHIRERAPIDS",
|
||||
"COOPERLAKE"
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
|
@ -2011,8 +1968,7 @@ static char *corename_lower[] = {
|
|||
"zen",
|
||||
"skylakex",
|
||||
"dhyana",
|
||||
"cooperlake",
|
||||
"sapphirerapids",
|
||||
"cooperlake"
|
||||
};
|
||||
|
||||
|
||||
|
@ -2318,18 +2274,16 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 15) { // Sapphire Rapids
|
||||
if(support_amx_bf16())
|
||||
return CORE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -2366,22 +2320,8 @@ int get_coretype(void){
|
|||
|
||||
case 10:
|
||||
switch (model) {
|
||||
case 13: // Granite Rapids
|
||||
if(support_amx_bf16())
|
||||
return CORE_SAPPHIRERAPIDS;
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
case 10: // Meteor Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
|
@ -2390,7 +2330,6 @@ int get_coretype(void){
|
|||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 0: // Meteor Lake
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
|
@ -2409,9 +2348,6 @@ int get_coretype(void){
|
|||
case 11:
|
||||
switch (model) {
|
||||
case 7: // Raptor Lake
|
||||
case 10:
|
||||
case 15:
|
||||
case 14: // Alder Lake N
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
|
@ -2421,10 +2357,10 @@ int get_coretype(void){
|
|||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
case 15:
|
||||
if (model <= 0x2) return CORE_NORTHWOOD;
|
||||
else return CORE_PRESCOTT;
|
||||
}
|
||||
case 15:
|
||||
if (model <= 0x2) return CORE_NORTHWOOD;
|
||||
else return CORE_PRESCOTT;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2480,7 +2416,7 @@ int get_coretype(void){
|
|||
}
|
||||
break;
|
||||
}
|
||||
} else if (exfamily == 8 || exfamily == 10 || exfamily == 11) {
|
||||
} else if (exfamily == 8 || exfamily == 10) {
|
||||
switch (model) {
|
||||
case 1:
|
||||
// AMD Ryzen
|
||||
|
@ -2488,12 +2424,6 @@ int get_coretype(void){
|
|||
// Ryzen 2
|
||||
default:
|
||||
// Matisse,Renoir Ryzen2 models
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
#endif
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_ZEN;
|
||||
|
@ -2555,7 +2485,6 @@ int get_coretype(void){
|
|||
case 0x7:
|
||||
switch (exmodel) {
|
||||
case 5:
|
||||
case 6:
|
||||
if (support_avx2())
|
||||
return CORE_ZEN;
|
||||
else
|
||||
|
|
4
ctest.c
4
ctest.c
|
@ -173,10 +173,6 @@ HAVE_C11
|
|||
ARCH_E2K
|
||||
#endif
|
||||
|
||||
#if defined(__csky__)
|
||||
ARCH_CSKY
|
||||
#endif
|
||||
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
ARCH_RISCV64
|
||||
OS_WINDOWS
|
||||
|
|
|
@ -6,10 +6,6 @@ enable_language(Fortran)
|
|||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2)
|
||||
list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os)
|
||||
set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE)
|
||||
endif()
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
|
||||
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
|
||||
endif()
|
||||
|
@ -44,10 +40,6 @@ else()
|
|||
c_${float_char}blas1.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat1 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat1 m)
|
||||
endif()
|
||||
|
@ -73,10 +65,6 @@ else()
|
|||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat2 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat2 m)
|
||||
endif()
|
||||
|
@ -92,17 +80,6 @@ if (NOT NOFORTRAN)
|
|||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
if (USE_GEMM3M)
|
||||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||
add_executable(x${float_char}cblat3_3m
|
||||
c_${float_char}blat3_3m.f
|
||||
c_${float_char}blas3_3m.c
|
||||
c_${float_char}3chke_3m.c
|
||||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
add_executable(x${float_char}cblat3
|
||||
c_${float_char}blat3c.c
|
||||
|
@ -111,44 +88,12 @@ else()
|
|||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
if (USE_GEMM3M)
|
||||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||
add_executable(x${float_char}cblat3_3m
|
||||
c_${float_char}blat3c_3m.c
|
||||
c_${float_char}blas3_3m.c
|
||||
c_${float_char}3chke_3m.c
|
||||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat3 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat3 m)
|
||||
endif()
|
||||
if (USE_GEMM3M)
|
||||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||
target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
|
||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat3 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat3_3m m)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
if (USE_GEMM3M)
|
||||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||
add_test(NAME "x${float_char}cblat3_3m"
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
endforeach()
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue