Merge pull request #4606 from OpenMathLib/develop

Merge develop branch for 0.3.27
This commit is contained in:
Martin Kroeker 2024-04-04 22:24:56 +02:00 committed by GitHub
commit 8f3bb62254
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2639 changed files with 141761 additions and 40705 deletions

View File

@ -1,44 +1,44 @@
macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
task:
name: AppleM1/LLVM
compile_script:
- brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang
#task:
# name: AppleM1/LLVM
# compile_script:
# - brew install llvm
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang
task:
name: AppleM1/LLVM/ILP64
compile_script:
- brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
#task:
# name: AppleM1/LLVM/ILP64
# compile_script:
# - brew install llvm
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
task:
name: AppleM1/LLVM/CMAKE
compile_script:
- brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- mkdir build
- cd build
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
- make -j 4
#task:
# name: AppleM1/LLVM/CMAKE
# compile_script:
# - brew install llvm
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
# - mkdir build
# - cd build
# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
# - make -j 4
task:
name: AppleM1/GCC/MAKE/OPENMP
compile_script:
- brew install gcc@11
- export PATH=/opt/homebrew/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/lib"
- export CPPFLAGS="-I/opt/homebrew/include"
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
#task:
# name: AppleM1/GCC/MAKE/OPENMP
# compile_script:
# - brew install gcc@11
# - export PATH=/opt/homebrew/bin:$PATH
# - export LDFLAGS="-L/opt/homebrew/lib"
# - export CPPFLAGS="-I/opt/homebrew/include"
# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
@ -58,8 +58,8 @@ task:
- export VALID_ARCHS="i386 x86_64"
- xcrun --sdk macosx --show-sdk-path
- xcodebuild -version
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
- export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64"
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always:
config_artifacts:
@ -78,8 +78,8 @@ task:
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0"
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
always:
config_artifacts:
@ -91,14 +91,16 @@ macos_instance:
task:
name: AppleM1/LLVM armv7-androidndk xbuild
compile_script:
- #brew install android-ndk
- brew install android-ndk
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
- ls /System/Volumes/Data/opt/homebrew
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always:
config_artifacts:

149
.github/workflows/apple_m.yml vendored Normal file
View File

@ -0,0 +1,149 @@
name: apple m
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
build:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: macos-14
strategy:
fail-fast: false
matrix:
build: [cmake, make]
fortran: [gfortran]
openmp: [0, 1]
ilp64: [0, 1]
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Print system information
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
cat /proc/cpuinfo
elif [ "$RUNNER_OS" == "macOS" ]; then
sysctl -a | grep machdep.cpu
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
- name: Install Dependencies
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get install -y gfortran cmake ccache libtinfo5
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
brew reinstall gcc
brew install coreutils cmake ccache
brew install llvm
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
# GNU make and cmake call the compilers differently. It looks like
# that causes the cache to mismatch. Keep the ccache for both build
# tools separate to avoid polluting each other.
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
ccache-${{ runner.os }}-${{ matrix.build }}
- name: Configure ccache
run: |
if [ "${{ matrix.build }}" = "make" ]; then
# Add ccache to path
if [ "$RUNNER_OS" = "Linux" ]; then
echo "/usr/lib/ccache" >> $GITHUB_PATH
elif [ "$RUNNER_OS" = "macOS" ]; then
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
echo "" >>$GITHUB_PATH
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
fi
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
- name: Build OpenBLAS
run: |
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
export CC="/opt/homebrew/opt/llvm/bin/clang"
case "${{ matrix.build }}" in
"make")
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
;;
"cmake")
export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
mkdir build && cd build
cmake -DDYNAMIC_ARCH=1 \
-DUSE_OPENMP=${{matrix.openmp}} \
-DINTERFACE64=${{matrix.ilp64}} \
-DNOFORTRAN=0 \
-DBUILD_WITHOUT_LAPACK=0 \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
cmake --build .
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac
- name: Show ccache status
continue-on-error: true
run: ccache -s
- name: Run tests
timeout-minutes: 60
run: |
case "${{ matrix.build }}" in
"make")
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
echo "::group::Tests in 'test' directory"
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'ctest' directory"
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'utest' directory"
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
;;
"cmake")
cd build && ctest
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac

View File

@ -14,8 +14,8 @@ jobs:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-latest
env:
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
strategy:
fail-fast: false
matrix:
@ -76,7 +76,7 @@ jobs:
run: |
wget ${xuetie_toolchain}/${toolchain_file_name}
tar -xvf ${toolchain_file_name} -C /opt
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

View File

@ -42,6 +42,7 @@ jobs:
- name: Install Dependencies
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get update
sudo apt-get install -y gfortran cmake ccache libtinfo5
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.

253
.github/workflows/riscv64_vector.yml vendored Normal file
View File

@ -0,0 +1,253 @@
name: riscv64 zvl256b qemu test
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
TEST:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-latest
env:
triple: riscv64-unknown-linux-gnu
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
riscv_gnu_toolchain_version: 13.2.0
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
strategy:
fail-fast: false
matrix:
include:
- target: RISCV64_ZVL128B
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
- target: RISCV64_ZVL256B
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: install build deps
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make \
libgomp1-riscv64-cross ccache
wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.target }}
- name: Configure ccache
run: |
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
# Cross-compile the library with clang, using the tools found under /opt/riscv/bin.
# The tool variables below are single-quoted, so the shell hands ${triple} and
# ${riscv_gnu_toolchain_version} to make unexpanded; presumably make then resolves
# them from the job-level env entries of the same names - TODO confirm before
# moving those values out of env.
- name: build OpenBLAS libs
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
- name: build OpenBLAS tests
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='${triple}-gcc' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
- name: build lapack-netlib tests
working-directory: ./lapack-netlib/TESTING
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='${triple}-gcc' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
- name: OpenBLAS tests
shell: bash
run: |
export PATH="/opt/riscv/bin:$PATH"
export QEMU_CPU=${{ matrix.qemu_cpu }}
rm -rf ./test_out
mkdir -p ./test_out
# Run one cross-compiled test binary under qemu-riscv64, logging to ./test_out/<dir>.<cmd>.
#   $1 - directory that holds the binary
#   $2 - name of the test binary
#   $3 - optional input file (relative to $1) that is fed to the binary on stdin
# A "*** FAIL" marker is appended to the log when the emulated binary exits non-zero.
run_test() {
  local DIR=$1 CMD=$2 DATA=$3
  local OUTPUT="./test_out/$DIR.$CMD"
  local RV=0
  echo "$(pwd)/$DIR/$CMD $DIR/$DATA" >> "$OUTPUT"
  # tee -a keeps the header line written above; plain tee would truncate the log.
  # The braces sit on the left of '||' so a shell running with errexit/pipefail does
  # not abort before the status is recorded, and PIPESTATUS[0] is the status of
  # qemu-riscv64 itself rather than that of tee.
  if [[ -z $DATA ]]; then
    { qemu-riscv64 "./$DIR/$CMD" |& tee -a "$OUTPUT"; RV=${PIPESTATUS[0]}; } || true
  else
    { qemu-riscv64 "./$DIR/$CMD" < "./$DIR/$DATA" |& tee -a "$OUTPUT"; RV=${PIPESTATUS[0]}; } || true
  fi
  if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> "$OUTPUT"; fi
}
run_test test cblat1 &
run_test test cblat2 cblat2.dat &
run_test test cblat3 cblat3.dat &
run_test test dblat1 &
run_test test dblat2 dblat2.dat &
run_test test dblat3 dblat3.dat &
run_test test sblat1 &
run_test test sblat2 sblat2.dat &
run_test test sblat3 sblat3.dat &
run_test test zblat1 &
run_test test zblat2 zblat2.dat &
run_test test zblat3 zblat3.dat &
run_test ctest xccblat1 &
run_test ctest xccblat2 cin2 &
run_test ctest xccblat3 cin3 &
run_test ctest xdcblat1 &
run_test ctest xdcblat2 din2 &
run_test ctest xdcblat3 din3 &
run_test ctest xscblat1 &
run_test ctest xscblat2 sin2 &
run_test ctest xscblat3 sin3 &
run_test ctest xzcblat1 &
run_test ctest xzcblat2 zin2 &
run_test ctest xzcblat3 zin3 &
wait
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
- name: netlib tests
shell: bash
run: |
: # these take a very long time
echo "Skipping netlib tests in CI"
exit 0
: # comment out exit above to enable the tests
: # probably we want to identify a subset to run in CI
export PATH="/opt/riscv/bin:$PATH"
export QEMU_CPU=${{ matrix.qemu_cpu }}
rm -rf ./test_out
mkdir -p ./test_out
# Run one lapack-netlib test program under qemu-riscv64 and log it to ./test_out/<name>.
#   $1 - name of the log file inside ./test_out
#   $2 - input file, relative to ./lapack-netlib/TESTING, fed to the program on stdin
#   $3 - test program, relative to ./lapack-netlib/TESTING
#   $4 - human-readable description written as the first line of the log
# "*** FAIL" markers are appended when the program exits non-zero or when its output
# mentions a failure or an unexpected error.
run_test() {
  local OUTPUT="./test_out/$1"
  local DATA="./lapack-netlib/TESTING/$2"
  local CMD="./lapack-netlib/TESTING/$3"
  local RV=0
  echo "$4" >> "$OUTPUT"
  echo "$CMD" >> "$OUTPUT"
  # tee -a keeps the two header lines written above; plain tee would truncate the log.
  # The braces sit on the left of '||' so a shell running with errexit/pipefail does
  # not abort before the status is recorded, and PIPESTATUS[0] is the status of
  # qemu-riscv64 itself rather than that of tee.
  { qemu-riscv64 "$CMD" < "$DATA" |& tee -a "$OUTPUT"; RV=${PIPESTATUS[0]}; } || true
  if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> "$OUTPUT"; fi
  if grep -q fail "$OUTPUT"; then echo "*** FAIL: log contains 'fail'" >> "$OUTPUT"; fi
  # Flag lines mentioning "rror" unless they also contain "passed" or "largest error".
  # A single awk pass replaces the grep chain whose leading 'grep -q' fed nothing downstream.
  if awk '/rror/ && !/passed/ && !/largest error/ { hit = 1 } END { exit !hit }' "$OUTPUT"; then
    echo "*** FAIL: log contains 'error'" >> "$OUTPUT"
  fi
}
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" &
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" &
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" &
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" &
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" &
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" &
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" &
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" &
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" &
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" &
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" &
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" &
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" &
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" &
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" &
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" &
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" &
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" &
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" &
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" &
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" &
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" &
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" &
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" &
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" &
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" &
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" &
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" &
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" &
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" &
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" &
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" &
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" &
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" &
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" &
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" &
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" &
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" &
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" &
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" &
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" &
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" &
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" &
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" &
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" &
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" &
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" &
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" &
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" &
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" &
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" &
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" &
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" &
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" &
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" &
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" &
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" &
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" &
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" &
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" &
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" &
wait
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
# Summarise the per-program logs with the netlib helper script.
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
# Start from non-zero counters so a summary without a totals line counts as a failure.
NUMERICAL_ERRORS=-1
OTHER_ERRORS=-1
# Source the two counters from fields 5 and 7 of the "ALL PRECISIONS" line.
# The process substitution needs its closing parenthesis to be valid bash.
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi

13
.gitignore vendored
View File

@ -47,46 +47,59 @@ config_last.h
getarch
getarch_2nd
utest/openblas_utest
utest/openblas_utest_ext
ctest/xccblat1
ctest/xccblat2
ctest/xccblat3
ctest/xccblat3_3m
ctest/xdcblat1
ctest/xdcblat2
ctest/xdcblat3
ctest/xdcblat3_3m
ctest/xscblat1
ctest/xscblat2
ctest/xscblat3
ctest/xscblat3_3m
ctest/xzcblat1
ctest/xzcblat2
ctest/xzcblat3
ctest/xzcblat3_3m
exports/linktest.c
exports/linux.def
kernel/setparam_*.c
kernel/kernel_*.h
test/CBLAT2.SUMM
test/CBLAT3.SUMM
test/CBLAT3_3M.SUMM
test/DBLAT2.SUMM
test/DBLAT3.SUMM
test/DBLAT3_3M.SUMM
test/SBLAT2.SUMM
test/SBLAT3.SUMM
test/SBLAT3_3M.SUMM
test/ZBLAT2.SUMM
test/ZBLAT3.SUMM
test/ZBLAT3_3M.SUMM
test/SHBLAT3.SUMM
test/SBBLAT3.SUMM
test/cblat1
test/cblat2
test/cblat3
test/cblat3_3m
test/dblat1
test/dblat2
test/dblat3
test/dblat3_3m
test/sblat1
test/sblat2
test/sblat3
test/sblat3_3m
test/test_shgemm
test/test_sbgemm
test/zblat1
test/zblat2
test/zblat3
test/zblat3_3m
build
build.*
*.swp

View File

@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@ -40,6 +42,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni
option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF)
set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" )
set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" )
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
@ -96,7 +103,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
@ -323,7 +330,7 @@ if (NOT NOFORTRAN)
# Build test and ctest
add_subdirectory(test)
endif()
if (BUILD_TESTING)
if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
add_subdirectory(lapack-netlib/TESTING)
endif()
endif()
@ -336,11 +343,12 @@ endif()
add_subdirectory(cpp_thread_test)
endif()
if (NOT FIXED_LIBNAME)
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)
endif()
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
@ -452,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
endif()
endif()
if (BUILD_BENCHMARKS)
#find_package(OpenMP REQUIRED)
file(GLOB SOURCES "benchmark/*.c")
if (NOT USE_OPENMP)
file(GLOB REMFILE "benchmark/smallscaling.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
if (BUILD_WITHOUT_LAPACK)
file(GLOB REMFILE "benchmark/cholesky.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/geev.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/gesv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/getri.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/potrf.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/spmv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/symv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/linpack.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
if (NOT USE_GEMM3M)
file(GLOB REMFILE "benchmark/gemm3m.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
foreach(source ${SOURCES})
get_filename_component(name ${source} NAME_WE)
if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
foreach(define ${defines})
set(target_name "benchmark_${name}")
if (NOT "${define}" STREQUAL "DEFAULT")
string(JOIN "_" define_str ${define})
set(target_name "${target_name}_${define_str}")
endif()
if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
add_executable(${target_name} ${source})
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
# target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
if (NOT "${define}" STREQUAL "DEFAULT")
target_compile_definitions(${target_name} PRIVATE ${define})
endif()
endif()
endforeach()
endif()
endforeach()
endif()
# Install project

View File

@ -218,4 +218,8 @@ In chronological order:
* [2022-08] Fix building from sources for QNX
* Mark Seminatore <https://github.com/mseminatore>
* [2023-11-09] Improve Windows threading performance scaling
* [2023-11-09] Improve Windows threading performance scaling
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
* Dirreke <https://github.com/Dirreke>
* [2024-01-16] Add basic support for the CSKY architecture

View File

@ -1,4 +1,104 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.27
4-Apr-2024
general:
- added initial (generic) support for the CSKY architecture
- capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating
underutilized or idle threads
- sped up multithreaded POTRF on all platforms
- added extension openblas_set_num_threads_local() that returns the previous thread count
- re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading
for too small workloads
- improved the fallback code used when the precompiled number of threads is exceeded,
and made it callable multiple times during the lifetime of an instance
- added CBLAS interfaces for the BLAS extensions ?AMIN,?AMAX, CAXPYC and ZAXPYC
- fixed a potential buffer overflow in the interface to the GEMMT kernels
- fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14
- fixed unwanted case sensitivity of the character parameters in ?TRTRS
- sped up the OpenMP thread management code
- fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK
- fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library
- added a testsuite for the BLAS extensions
- modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress
spurious errors
- added support for building the benchmark collection with CMAKE
- added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds
with OpenMP enabled that use clang with gfortran
- fixed building on systems with uClibc
- added support for calling ?NRM2 with a negative increment value on all architectures
- added support for the LLVM18 version of the flang-new compiler
- fixed handling of the OPENBLAS_LOOPS variable in several benchmarks
- Integrated fixes from the Reference-LAPACK project:
- Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981)
x86:
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed GEMM3M functions failing in CMAKE builds
x86-64:
- removed all instances of sched_yield() on Linux and BSD
- fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26)
- fixed GEMM3M functions failing in CMAKE builds
- fixed handling of NaN and Inf arguments in ZSCAL
- added compiler checks for AVX512BF16 compatibility
- fixed LLVM compiler options for Sapphire Rapids
- fixed cpu handling fallbacks for Sapphire Rapids with
disabled AVX2 in DYNAMIC_ARCH mode
- fixed extensions SCSUM and DZSUM
- improved GEMM performance for ZEN targets
arm:
- fixed handling of NaN and Inf arguments in ZSCAL
arm64:
- added initial support for the Cortex-A76 cpu
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed default compiler options for gcc (-march and -mtune)
- added support for ArmCompilerForLinux
- added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds
- fixed mishandling of the INTERFACE64 option in CMAKE builds
- corrected SCSUM kernels (erroneously duplicating SCASUM behaviour)
- added SVE-enabled kernels for CSUM/ZSUM
- worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M
power:
- improved performance of SGEMM on POWER8/9/10
- improved performance of DGEMM on POWER10
- added support for OpenMP builds with xlc/xlf on AIX
- improved cpu autodetection for DYNAMIC_ARCH builds on older AIX
- fixed cpu core counting on AIX
- added support for building a shared library on AIX
riscv64:
- added support for the X280 cpu
- added support for semi-generic RISCV models with vector length 128 or 256
- added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers
- fixed handling of NaN and Inf arguments in ZSCAL
- improved cpu model autodetection
- fixed corner cases in ?AXPBY for C910V
- fixed handling of zero increments in ?AXPY kernels for C910V
loongarch64:
- added optimized kernels for ?AMIN and ?AMAX
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed handling of corner cases in ?AXPBY
- fixed computation of SAMIN and DAMIN in LSX mode
- fixed computation of ?ROT
- added optimized SSYMV and DSYMV kernels for LSX and LASX mode
- added optimized CGEMM and ZGEMM kernels for LSX and LASX mode
- added optimized CGEMV and ZGEMV kernels
mips:
- fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22)
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed mishandling of the INTERFACE64 option in CMAKE builds
zarch:
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed calculation of ?SUM on Z13
====================================================================
Version 0.3.26
2-Jan-2024

View File

@ -1,5 +1,9 @@
TOPDIR = .
include ./Makefile.system
LNCMD = ln -fs
ifeq ($(FIXED_LIBNAME), 1)
LNCMD = true
endif
BLASDIRS = interface driver/level2 driver/level3 driver/others
@ -134,17 +138,17 @@ shared : libs netlib $(RELA)
ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll
@ -152,6 +156,9 @@ endif
ifeq ($(OSNAME), CYGWIN_NT)
@$(MAKE) -C exports dll
endif
ifeq ($(OSNAME), AIX)
@$(MAKE) -C exports so
endif
endif
tests : shared
@ -229,13 +236,13 @@ ifeq ($(INTERFACE64),1)
endif
@echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
@echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@touch lib.grd
prof : prof_blas prof_lapack
prof_blas :
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
for d in $(SUBDIRS) ; \
do if test -d $$d; then \
$(MAKE) -C $$d prof || exit 1 ; \
@ -246,7 +253,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
endif
blas :
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ; \
do if test -d $$d; then \
$(MAKE) -C $$d libs || exit 1 ; \
@ -254,7 +261,7 @@ blas :
done
hpl :
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ../laswp exports ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
@ -268,7 +275,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
endif
hpl_p :
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
for d in $(SUBDIRS) ../laswp exports ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
@ -309,8 +316,12 @@ endif
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
else
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGIBM1)
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
endif
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -401,6 +412,7 @@ lapack-runtest: lapack-test
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)

View File

@ -58,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
endif
ifeq ($(CORE), CORTEXA76)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
endif
endif
ifeq ($(CORE), FT2000)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
@ -104,19 +111,25 @@ ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif
else
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(CROSS), 1)
CCOMMON_OPT += -mtune=native
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=native
FCOMMON_OPT += -march=armv8.4-a
ifneq ($(CROSS), 1)
FCOMMON_OPT += -mtune=native
endif
endif
endif
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
@ -132,25 +145,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
endif
else
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
CCOMMON_OPT += -march=armv8.5-a+sve
ifneq ($(CROSS), 1)
CCOMMON_OPT += -mtune=native
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a -mtune=native
FCOMMON_OPT += -march=armv8.5-a
ifneq ($(CROSS), 1)
FCOMMON_OPT += -mtune=native
endif
endif
endif
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
@ -258,9 +277,17 @@ endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXX1)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
CCOMMON_OPT += -march=armv8.2-a
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ12) $(ISCLANG)))
CCOMMON_OPT += -mtune=cortex-x1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-x1
endif
else
CCOMMON_OPT += -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
endif
endif
endif
@ -271,6 +298,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
CCOMMON_OPT += -mtune=cortex-x2
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -mtune=cortex-x2
endif
endif
endif
endif
@ -290,6 +323,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
CCOMMON_OPT += -mtune=cortex-a710
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -mtune=cortex-a710
endif
endif
endif
endif

4
Makefile.csky Normal file
View File

@ -0,0 +1,4 @@
# Compiler options appended when CORE is CK860FV. The Fortran options are the
# same as the C options plus -static.
ifeq ($(CORE), CK860FV)
CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
endif

View File

@ -2,11 +2,15 @@ TOPDIR = .
export GOTOBLAS_MAKEFILE = 1
-include $(TOPDIR)/Makefile.conf_last
include ./Makefile.system
LNCMD = ln -fs
ifdef THELIBNAME
LIBNAME=$(THELIBNAME)
LIBSONAME=$(THELIBSONAME)
endif
ifeq ($(FIXED_LIBNAME), 1)
LNCMD = true
endif
ifeq ($(INTERFACE64),1)
USE_64BITINT=1
endif
@ -99,7 +103,7 @@ ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifneq ($(NO_SHARED),1)
@ -107,21 +111,21 @@ ifneq ($(NO_SHARED),1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@ -149,15 +153,15 @@ ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
endif
@ -170,6 +174,8 @@ endif
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
@echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)"
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@ -186,7 +192,7 @@ endif
ifneq ($(NO_SHARED),1)
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V)
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
endif
ifeq ($(TARGET), CK860FV)
TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
endif
ifeq ($(TARGET), x280)
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
endif
ifeq ($(TARGET), RISCV64_ZVL256B)
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
endif
ifeq ($(TARGET), RISCV64_ZVL128B)
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
endif
ifeq ($(TARGET), RISCV64_GENERIC)
TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
endif
all: getarch_2nd
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)

View File

@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
endif
# Per-core compiler options for x280 and the generic RISC-V targets.
# In every block the Fortran options additionally carry -static.
# For x280 and RISCV64_ZVL256B only the C -march string has the _zvl*b suffix;
# x280 also adds -ffast-math to the C options only.
ifeq ($(CORE), x280)
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
endif
ifeq ($(CORE), RISCV64_ZVL256B)
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
endif
# RISCV64_ZVL128B uses the plain vector-enabled -march string for both C and Fortran.
ifeq ($(CORE), RISCV64_ZVL128B)
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
endif
# RISCV64_GENERIC omits the vector extension ("v") from -march.
ifeq ($(CORE), RISCV64_GENERIC)
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
endif

View File

@ -3,7 +3,12 @@
#
# This library's version
VERSION = 0.3.26
VERSION = 0.3.26.dev
# If you set this prefix, the library name will be lib$(LIBNAMEPREFIX)openblas.a
# and lib$(LIBNAMEPREFIX)openblas.so, with a matching soname in the shared library
#
# LIBNAMEPREFIX = scipy
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -365,8 +365,9 @@ GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
# Note that the behavior of -dumpversion is compile-time-configurable for
# gcc-7.x and newer. Use -dumpfullversion there
ifeq ($(GCCVERSIONGTEQ7),1)
@ -873,6 +874,11 @@ endif
endif
endif
ifeq ($(ARCH), csky)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
#
# C Compiler dependent settings
#
@ -1176,7 +1182,7 @@ ifeq ($(F_COMPILER), IBM)
CCOMMON_OPT += -DF_INTERFACE_IBM
FEXTRALIB += -lxlf90
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
FCOMMON_OPT += -qextname
FCOMMON_OPT += -qextname -qzerosize
endif
# FCOMMON_OPT += -qarch=440
ifdef BINARY64
@ -1511,16 +1517,28 @@ ifndef LIBSONAMEBASE
LIBSONAMEBASE = openblas
endif
# Default to an empty library name prefix when none was given.
ifndef LIBNAMEPREFIX
LIBNAMEPREFIX =
endif
# SYMPREFIX and SYMSUFFIX start as copies of SYMBOLPREFIX and SYMBOLSUFFIX, but
# are cleared when they are identical to LIBNAMEPREFIX / LIBNAMESUFFIX.
# NOTE(review): presumably this keeps the same string from being applied twice
# when the library file name is assembled - confirm against the LIBNAMEBASE and
# LIBPREFIX definitions.
SYMPREFIX=$(SYMBOLPREFIX)
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
SYMPREFIX=
endif
SYMSUFFIX=$(SYMBOLSUFFIX)
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
SYMSUFFIX=
endif
ifndef LIBNAMESUFFIX
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
else
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
endif
ifeq ($(OSNAME), CYGWIN_NT)
LIBPREFIX = cyg$(LIBNAMEBASE)
LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE)
else
LIBPREFIX = lib$(LIBNAMEBASE)
LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE)
endif
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
@ -1652,6 +1670,10 @@ ifeq ($(F_COMPILER),CRAY)
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif
# When the Fortran compiler is FLANGNEW, strip -m32/-m64 and the listed x86 SIMD
# options from FFLAGS; LAPACK_FFLAGS is set to the same filtered list.
ifeq ($(F_COMPILER),FLANGNEW)
LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif
LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
@ -1699,14 +1721,14 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
endif
endif
# With FIXED_LIBNAME=1 the static and profiled library file names are built only
# from the optional name prefix, the base name and the optional name suffix.
# Both names must use $(LIBSONAMEBASE): a misspelled variable expands to an empty
# string in make, which would silently drop the base name from LIBNAME_P.
ifeq ($(FIXED_LIBNAME),1)
LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
LIBNAME_P = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
endif
LIBDLLNAME = $(LIBPREFIX).dll
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
ifneq ($(OSNAME), AIX)
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
else
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
endif
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)

View File

@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC)
endif
endif
else ifeq ($(C_COMPILER), CLANG)
# cooperlake support was added in clang 12
# sapphire rapids support was added in clang 12
ifeq ($(CLANGVERSIONGTEQ12), 1)
CCOMMON_OPT += -march=cooperlake
CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake
FCOMMON_OPT += -march=sapphirerapids
endif
else # not supported in clang, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512

View File

@ -167,6 +167,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 ( different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)
- **Cortex A76**: same as A57 (different cpu specifications)
- **Falkor**: same as A57 (different cpu specifications)
- **ThunderX**: Optimized some Level-1 functions
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
@ -185,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
- **AIX**: Dynamic architecture with OpenXL and OpenMP.
```sh
make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
```
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2
@ -198,6 +204,21 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
```
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
```sh
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
```
- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
e.g.:
```sh
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
HOSTCC=gcc HOSTFC=gfortran -j
```
### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
@ -227,7 +248,7 @@ Please note that it is not possible to combine support for different architectur
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **AIX**: Supported on PPC up to POWER8
- **AIX**: Supported on PPC up to POWER10
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.

View File

@ -93,6 +93,7 @@ CORTEXA53
CORTEXA57
CORTEXA72
CORTEXA73
CORTEXA76
CORTEXA510
CORTEXA710
CORTEXX1
@ -118,8 +119,11 @@ Z13
Z14
10.RISC-V 64:
RISCV64_GENERIC
RISCV64_GENERIC (e.g. PolarFire SoC/SiFive U54)
RISCV64_ZVL128B
C910V
x280
RISCV64_ZVL256B
11.LOONGARCH64:
LOONGSONGENERIC
@ -133,3 +137,7 @@ E2K
EV4
EV5
EV6
14.CSKY
CSKY
CK860FV

View File

@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
# x280 temporary workaround for gfortran
ifeq ($(TARGET), x280)
CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
endif
ifneq ($(NO_LAPACK), 1)
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
include $(TOPDIR)/Makefile.tail
include $(TOPDIR)/Makefile.tail

View File

@ -92,7 +92,7 @@ int main(int argc, char *argv[]){
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);

View File

@ -85,7 +85,7 @@ int main(int argc, char *argv[]){
double time1, time2, timeg1,timeg2;
char *p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
argc--;argv++;

View File

@ -120,7 +120,7 @@ int main(int argc, char *argv[]){
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);

View File

@ -54,7 +54,7 @@ int main(int argc, char *argv[]){
int step = 1;
int loops = 1;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
double time1,timeg;

24
c_check
View File

@ -91,6 +91,7 @@ case "$data" in
*ARCH_ZARCH*) architecture=zarch ;;
*ARCH_RISCV64*) architecture=riscv64 ;;
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
*ARCH_CSKY*) architecture=csky ;;
esac
defined=0
@ -236,6 +237,7 @@ case "$data" in
*ARCH_ARM*) architecture=arm ;;
*ARCH_ZARCH*) architecture=zarch ;;
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
*ARCH_CSKY*) architecture=csky ;;
esac
binformat='bin32'
@ -244,6 +246,7 @@ case "$data" in
esac
no_avx512=0
no_avx512bf=0
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
}
rm -rf "$tmpd"
# Probe for AVX512-BF16 support, but only when no_avx512 is still 0.
# A small C file is written to a fresh temporary directory and compiled with the
# cooperlake target selected; no_avx512bf is set to 1 if the compiler call fails.
if [ "$no_avx512" -eq 0 ]; then
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
# NOTE(review): the value of $code includes the surrounding double quotes, so
# the generated main() appears to contain only a C string literal rather than a
# call to the intrinsic. If so, this effectively checks whether the compiler
# accepts the cooperlake target and <immintrin.h> - confirm this is intended.
code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
# PGI selects the target processor with -tp; other compilers get -march.
if [ "$compiler" = "PGI" ]; then
args=" -tp cooperlake -c -o $tmpf.o $tmpf"
else
args=" -march=cooperlake -c -o $tmpf.o $tmpf"
fi
no_avx512bf=0
# Compiler output is discarded; only the exit status decides the result.
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
no_avx512bf=1
}
# Remove the temporary directory together with the test source and object file.
rm -rf "$tmpd"
fi
fi
no_rv64gv=0
@ -409,6 +431,7 @@ done
[ "$makefile" = "-" ] && {
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
exit 0
@ -437,6 +460,7 @@ done
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"

View File

@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$architecture = csky if ($data =~ /ARCH_CSKY/);
$defined = 0;
@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
$binary = 64;
}
if ($architecture eq "csky") {
$defined = 1;
$binary = 32;
}
if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64");
@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$architecture = csky if ($data =~ /ARCH_CSKY/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);

22
cblas.h
View File

@ -12,6 +12,7 @@ extern "C" {
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
int openblas_set_num_threads_local(int num_threads);
/*Get the number of threads on runtime.*/
int openblas_get_num_threads(void);
@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);

View File

@ -64,6 +64,7 @@ else ()
"#define NEEDBUNDERSCORE 1\n")
endif()
if (CMAKE_Fortran_COMPILER)
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
string(TOUPPER ${F_COMPILER} F_COMPILER)
endif()

View File

@ -6,9 +6,6 @@
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
endif ()
@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
if (MIPS64)
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
endif ()
@ -83,9 +83,14 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif ()
endif ()
endif ()
if (ARM64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else ()
if (BINARY64)
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
if (INTERFACE64)
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
if (WIN32)
@ -98,7 +103,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif ()
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
endif ()
endif ()
endif ()

View File

@ -1,4 +1,6 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libnameprefix=@LIBNAMEPREFIX@
libnamesuffix=@LIBNAMESUFFIX@
libsuffix=@SUFFIX64_UNDERSCORE@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
@ -7,5 +9,5 @@ Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OpenBLAS_VERSION@
URL: https://github.com/OpenMathLib/OpenBLAS
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix}
Cflags: -I${includedir}

View File

@ -932,7 +932,7 @@ endif ()
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73" OR "${TCORE}" STREQUAL "CORTEXA76")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_LINESIZE\t64\n"

View File

@ -501,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH
endif()
endif()
endif()
set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas")
if (DEFINED LIBNAMESUFFIX)
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
else ()
set(LIBPREFIX "libopenblas")
set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}")
endif ()
if (NOT DEFINED SYMBOLPREFIX)
@ -615,13 +616,19 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
endforeach ()
endif ()
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY")
if (CMAKE_Fortran_COMPILER)
  if (${F_COMPILER} STREQUAL "NAG" OR ${F_COMPILER} STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
    # Strip x86 SIMD options from the LAPACK Fortran flags for these compilers.
    # The list must be ';'-separated (a ',' would fuse two options into one
    # entry that never matches). -mavx2 is listed before -mavx so that removing
    # the shorter option cannot leave a stray "2" behind.
    set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx2;-mavx;-mskylake-avx512")
    if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
      # LLVM Flang additionally gets the -m32/-m64 options removed.
      message(STATUS "removing fortran flags")
      set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
    endif ()
    foreach (FILTER_FLAG ${FILTER_FLAGS})
      # Quote the input so an empty or unset flag variable is still a valid
      # argument to string(REPLACE).
      string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS "${LAPACK_FFLAGS}")
      string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS "${LAPACK_FPFLAGS}")
    endforeach ()
  endif ()
endif ()
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
# lapack-netlib is rife with uninitialized warnings -hpa
@ -679,6 +686,10 @@ else ()
endif ()
endif ()
if (DEFINED FIXED_LIBNAME)
  # With FIXED_LIBNAME the library file names are built from LIBPREFIX alone.
  # The "_p" name belongs in LIBNAME_P; assigning it to LIBNAME as well would
  # overwrite the regular library name set on the line above.
  set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}")
  set (LIBNAME_P "${LIBPREFIX}_p.${LIBSUFFIX}")
endif()
set(LIBDLLNAME "${LIBPREFIX}.dll")
set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")

View File

@ -358,12 +358,6 @@ typedef int blasint;
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
#endif
#ifdef BULLDOZER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#ifndef YIELDING
@ -371,21 +365,13 @@ typedef int blasint;
#endif
#endif
/*
#ifdef PILEDRIVER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
*/
/*
#ifdef STEAMROLLER
#if defined(ARCH_X86_64)
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
*/
#ifdef __EMSCRIPTEN__
#define YIELDING
@ -396,7 +382,7 @@ typedef int blasint;
#endif
/***
To alloc job_t on heap or statck.
To alloc job_t on heap or stack.
please see https://github.com/xianyi/OpenBLAS/issues/246
***/
#if defined(OS_WINDOWS)
@ -482,6 +468,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_e2k.h"
#endif
#ifdef ARCH_CSKY
#include "common_csky.h"
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];

56
common_csky.h Normal file
View File

@ -0,0 +1,56 @@
/*****************************************************************************
Copyright (c) 2011-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#ifndef COMMON_CSKY
#define COMMON_CSKY
/* Memory-barrier macros. All three expand to the same compiler builtin,
   __sync_synchronize(), so no distinct write-only or read-only fence is
   used here. */
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define RMB __sync_synchronize()
/* INLINE maps to the plain C `inline` keyword. */
#define INLINE inline
#ifndef ASSEMBLER
/* Divide x by y using ordinary integer division and return the quotient.
   The caller must pass a non-zero y; no check is made here. */
static inline int blas_quickdivide(blasint x, blasint y){
  blasint quotient = x / y;
  return quotient;
}
#endif
#define BUFFER_SIZE ( 32 << 20)
#define SEEK_ADDRESS
#endif

View File

@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *,
float *, float *, blasint *);
@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);

View File

@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define BUFFER_SIZE ( 32 << 20)
#define SEEK_ADDRESS
#if defined(C910V)
#include <riscv_vector.h>
#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
# include <riscv_vector.h>
#endif
#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
#define RISCV_0p10_INTRINSICS
#define RISCV_RVV(x) x
#else
#define RISCV_RVV(x) __riscv_ ## x
#endif
#if defined(C910V) || defined(RISCV64_ZVL256B)
# if !defined(DOUBLE)
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
# else
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
# endif
#else
# define EXTRACT_FLOAT(v) (v[0])
#endif
#endif

View File

@ -137,19 +137,20 @@ typedef struct blas_queue {
extern int blas_server_avail;
extern int blas_omp_number_max;
extern int blas_omp_threads_local;
static __inline int num_cpu_avail(int level) {
#ifdef USE_OPENMP
int openmp_nthreads;
openmp_nthreads=omp_get_max_threads();
if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
#endif
#ifndef USE_OPENMP
if (blas_cpu_number == 1
#endif
#ifdef USE_OPENMP
if (openmp_nthreads == 1 || omp_in_parallel()
#else
if (openmp_nthreads == 1
#endif
) return 1;

View File

@ -42,6 +42,7 @@ size_t length64=sizeof(value64);
#define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
#define CPU_CORTEXA76 23
#define CPU_NEOVERSEN1 11
#define CPU_NEOVERSEV1 16
#define CPU_NEOVERSEN2 17
@ -89,7 +90,8 @@ static char *cpuname[] = {
"CORTEXX2",
"CORTEXA510",
"CORTEXA710",
"FT2000"
"FT2000",
"CORTEXA76"
};
static char *cpuname_lower[] = {
@ -115,7 +117,8 @@ static char *cpuname_lower[] = {
"cortexx2",
"cortexa510",
"cortexa710",
"ft2000"
"ft2000",
"cortexa76"
};
int get_feature(char *search)
@ -210,6 +213,8 @@ int detect(void)
return CPU_CORTEXX2;
else if (strstr(cpu_part, "0xd4e")) //X3
return CPU_CORTEXX2;
else if (strstr(cpu_part, "0xd0b"))
return CPU_CORTEXA76;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@ -391,6 +396,7 @@ void get_cpuconfig(void)
break;
case CPU_NEOVERSEV1:
case CPU_CORTEXA76:
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");

View File

@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define CPU_GENERIC 0
#define CPU_C910V 1
#define CPU_GENERIC 0
#define CPU_C910V 1
#define CPU_x280 2
#define CPU_RISCV64_ZVL256B 3
#define CPU_RISCV64_ZVL128B 4
static char *cpuname[] = {
"RISCV64_GENERIC",
"C910V"
"C910V",
"x280",
"CPU_RISCV64_ZVL256B",
"CPU_RISCV64_ZVL128B"
};
/* Lower-case CPU names, indexed by the CPU_* value returned by detect()
   (get_libname() prints cpuname_lower[detect()]). The entry order must
   therefore follow the numeric CPU_* definitions: 0 = generic, 1 = C910V,
   2 = x280, 3 = ZVL256B, 4 = ZVL128B. */
static char *cpuname_lower[] = {
"riscv64_generic",
"c910v",
"x280",
"riscv64_zvl256b",
"riscv64_zvl128b"
};
int detect(void){
@ -86,23 +100,29 @@ int detect(void){
char *pmodel = NULL, *pisa = NULL;
infile = fopen("/proc/cpuinfo", "r");
if (!infile)
return CPU_GENERIC;
while (fgets(buffer, sizeof(buffer), infile)){
if(!strncmp(buffer, "model name", 10)){
strcpy(model_buffer, buffer);
pmodel = strchr(isa_buffer, ':') + 1;
pmodel = strchr(model_buffer, ':');
if (pmodel)
pmodel++;
}
if(!strncmp(buffer, "isa", 3)){
strcpy(isa_buffer, buffer);
pisa = strchr(isa_buffer, '4') + 1;
pisa = strchr(isa_buffer, '4');
if (pisa)
pisa++;
}
}
fclose(infile);
if (!pmodel)
if (!pmodel || !pisa)
return(CPU_GENERIC);
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
return CPU_C910V;
@ -140,5 +160,5 @@ void get_cpuconfig(void){
}
void get_libname(void){
printf("riscv64\n");
printf("%s", cpuname_lower[detect()]);
}

View File

@ -173,6 +173,10 @@ HAVE_C11
ARCH_E2K
#endif
#if defined(__csky__)
ARCH_CSKY
#endif
#if defined(__EMSCRIPTEN__)
ARCH_RISCV64
OS_WINDOWS

View File

@ -40,6 +40,10 @@ else()
c_${float_char}blas1.c)
endif()
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat1 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat1 m)
endif()
@ -65,6 +69,10 @@ else()
constant.c)
endif()
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat2 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat2 m)
endif()
@ -80,6 +88,17 @@ if (NOT NOFORTRAN)
auxiliary.c
c_xerbla.c
constant.c)
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
add_executable(x${float_char}cblat3_3m
c_${float_char}blat3_3m.f
c_${float_char}blas3_3m.c
c_${float_char}3chke_3m.c
auxiliary.c
c_xerbla.c
constant.c)
endif()
endif()
else()
add_executable(x${float_char}cblat3
c_${float_char}blat3c.c
@ -88,12 +107,44 @@ else()
auxiliary.c
c_xerbla.c
constant.c)
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
add_executable(x${float_char}cblat3_3m
c_${float_char}blat3c_3m.c
c_${float_char}blas3_3m.c
c_${float_char}3chke_3m.c
auxiliary.c
c_xerbla.c
constant.c)
endif()
endif()
endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat3 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3 m)
endif()
if (USE_GEMM3M)
  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
    # Link settings for the GEMM3M test executable, which exists only for the
    # complex precisions (c and z). Every call below must name the _3m target.
    target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
    if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
      # gfortran combined with clang: drop -fopenmp from the Fortran flags and
      # link omp and pthread explicitly to the 3m test itself.
      string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
      target_link_libraries(x${float_char}cblat3_3m omp pthread)
    endif()
    if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
      target_link_libraries(x${float_char}cblat3_3m m)
    endif()
  endif()
endif()
add_test(NAME "x${float_char}cblat3"
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
if (USE_GEMM3M)
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
add_test(NAME "x${float_char}cblat3_3m"
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
endif()
endif()
endforeach()

View File

@ -5,6 +5,24 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
SUPPORT_GEMM3M = 0
ifeq ($(ARCH), x86)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), x86_64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), ia64)
SUPPORT_GEMM3M = 1
endif
ifeq ($(ARCH), MIPS)
SUPPORT_GEMM3M = 1
endif
override CFLAGS += -DADD$(BU) -DCBLAS
ifeq ($(F_COMPILER),GFORTRAN)
override FFLAGS += -fno-tree-vectorize
@ -144,9 +162,15 @@ all3targets += xdcblat3
endif
ifeq ($(BUILD_COMPLEX),1)
all3targets += xccblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xccblat3_3m
endif
endif
ifeq ($(BUILD_COMPLEX16),1)
all3targets += xzcblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xzcblat3_3m
endif
endif
all3: $(all3targets)
@ -181,9 +205,9 @@ endif
endif
endif
all3_3m: xzcblat3_3m xccblat3_3m
ifeq ($(SUPPORT_GEMM3M),1)
ifeq ($(USE_OPENMP), 1)
ifeq ($(BUILD_SINGLE),1)
ifeq ($(BUILD_COMPLEX),1)
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
endif
ifeq ($(BUILD_COMPLEX16),1)
@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
endif
endif
@ -218,6 +243,9 @@ ifeq ($(F_COMPILER), IBM)
ifeq ($(C_COMPILER), GCC)
CEXTRALIB += -lgomp
endif
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB += -lomp
endif
endif
endif
@ -268,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@ -277,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
ifeq ($(SUPPORT_GEMM3M),1)
# C-only build of the single-complex GEMM3M test. Linked like the sibling
# C-only rules: -lgfortran is filtered out of EXTRALIB.
xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
endif
@ -290,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@ -299,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
ifeq ($(SUPPORT_GEMM3M),1)
# C-only build of the double-complex GEMM3M test. Linked like the sibling
# C-only rules: -lgfortran is filtered out of EXTRALIB.
xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
endif

View File

@ -96,7 +96,7 @@
INTEGER ICAMAXTEST
EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST
* .. External Subroutines ..
EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
* .. Intrinsic Functions ..
INTRINSIC MAX
* .. Common blocks ..
@ -214,8 +214,8 @@
CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
+ STRUE4(NP1),SFAC)
ELSE IF (ICASE.EQ.8) THEN
* .. CSCAL ..
CALL CSCAL(N,CA,CX,INCX)
* .. CSCALTEST ..
CALL CSCALTEST(N,CA,CX,INCX)
CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
+ SFAC)
ELSE IF (ICASE.EQ.9) THEN
@ -236,14 +236,14 @@
*
INCX = 1
IF (ICASE.EQ.8) THEN
* CSCAL
* CSCALTEST
* Add a test for alpha equal to zero.
CA = (0.0E0,0.0E0)
DO 80 I = 1, 5
MWPCT(I) = (0.0E0,0.0E0)
MWPCS(I) = (1.0E0,1.0E0)
80 CONTINUE
CALL CSCAL(5,CA,CX,INCX)
CALL CSCALTEST(5,CA,CX,INCX)
CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
ELSE IF (ICASE.EQ.9) THEN
* CSSCALTEST

View File

@ -440,6 +440,7 @@ static real c_b43 = (float)1.;
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
static complex mwpcs[5], mwpct[5];
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
static complex cx[8];
extern real scnrm2test_(integer*, complex*, integer*);
static integer np1;
@ -481,7 +482,7 @@ static real c_b43 = (float)1.;
stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
} else if (combla_1.icase == 8) {
/* .. CSCAL .. */
cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
&ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
} else if (combla_1.icase == 9) {
@ -515,7 +516,7 @@ static real c_b43 = (float)1.;
mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
/* L80: */
}
cscal_(&c__5, &ca, cx, &combla_1.incx);
cscaltest_(&c__5, &ca, cx, &combla_1.incx);
ctest_(&c__5, cx, mwpct, mwpcs, sfac);
} else if (combla_1.icase == 9) {
/* CSSCALTEST */

3942
ctest/c_cblat3c_3m.c Normal file

File diff suppressed because it is too large Load Diff

3951
ctest/c_zblat3c_3m.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -545,13 +545,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, IFLOAT *sa, IFLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) {
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#ifdef USE_OPENMP
static omp_lock_t level3_lock, critical_section_lock;
static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0,
parallel_section_left = MAX_PARALLEL_NUMBER;
// Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c
while(omp_lock_initialized == 0)
{
blas_lock(&init_lock);
{
if(omp_lock_initialized == 0)
{
omp_init_lock(&level3_lock);
omp_init_lock(&critical_section_lock);
omp_lock_initialized = 1;
WMB;
}
blas_unlock(&init_lock);
}
}
#elif defined(OS_WINDOWS)
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#endif
blas_arg_t newarg;
@ -599,12 +617,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#ifdef USE_OPENMP
omp_set_lock(&level3_lock);
omp_set_lock(&critical_section_lock);
parallel_section_left--;
/*
How the OpenMP locks work with NUM_PARALLEL
1) parallel_section_left = number of available concurrent executions of OpenBLAS - number of currently executing OpenBLAS calls
2) level3_lock acts like a master lock or barrier which stops OpenBLAS calls when all the parallel sections are busy executing other OpenBLAS calls
3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently, and for unlocking the master lock whenever required
4) Unlock the master lock only when we have not already exhausted all the parallel sections, and so allow another thread with an OpenBLAS call to enter
*/
if(parallel_section_left != 0)
omp_unset_lock(&level3_lock);
omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
pthread_mutex_lock(&level3_lock);
#endif
#ifdef USE_ALLOC_HEAP
@ -732,12 +766,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
free(job);
#endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
#ifdef USE_OPENMP
omp_set_lock(&critical_section_lock);
parallel_section_left++;
/*
Unlock the master lock only when all the parallel sections were already exhausted and one of the threads has completed its OpenBLAS call;
otherwise just increment parallel_section_left.
The master lock is only locked when we have exhausted all the parallel sections, so only unlock it then and otherwise just increment the count.
*/
if(parallel_section_left == 1)
omp_unset_lock(&level3_lock);
omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#else
pthread_mutex_unlock(&level3_lock);
#endif
return 0;

View File

@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void);
/* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
int blas_omp_threads_local = 1;
/* Local Variables */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;

View File

@ -69,6 +69,7 @@
int blas_server_avail = 0;
int blas_omp_number_max = 0;
int blas_omp_threads_local = 1;
extern int openblas_omp_adaptive_env(void);
@ -406,7 +407,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
}
#endif
while(true) {
while (true) {
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
#ifdef HAVE_C11
_Bool inuse = false;
@ -419,10 +420,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
break;
}
}
if(i != MAX_PARALLEL_NUMBER)
break;
}
if (i != MAX_PARALLEL_NUMBER)
break;
}
if (openblas_omp_adaptive_env() != 0) {
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
for (i = 0; i < num; i ++) {

View File

@ -48,6 +48,12 @@
#endif
#endif
/* MT_TRACE(fmt, ...): thread-server debug tracing. With SMP_DEBUG defined the
   arguments are passed to fprintf(stderr, ...); otherwise the macro expands
   to nothing and its arguments are not evaluated. */
#if defined(SMP_DEBUG)
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
#else
# define MT_TRACE(...)
#endif
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common information */
@ -59,6 +65,8 @@ static CRITICAL_SECTION queue_lock;
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
int blas_omp_threads_local = 1;
/* Local Variables */
static BLASULONG server_lock = 0;
@ -66,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER];
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
#if defined (__GNUC__) && (__GNUC__ < 6)
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
#else
#if defined(_WIN64)
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
#else
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
#endif
#endif
//
// Legacy code path
//
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){
if (!(mode & BLAS_COMPLEX)) {
#ifdef EXPRECISION
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* REAL / Extended Double */
@ -93,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb);
} else
#endif
if ((mode & BLAS_PREC) == BLAS_DOUBLE){
if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
/* REAL / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG,
@ -104,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_SINGLE){
} else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
/* REAL / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG,
@ -116,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
#ifdef BUILD_BFLOAT16
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
/* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
@ -127,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){
} else if ((mode & BLAS_PREC) == BLAS_STOBF16) {
/* REAL / BLAS_STOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, bfloat16 *, BLASLONG,
@ -138,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
/* REAL / BLAS_DTOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, bfloat16 *, BLASLONG,
@ -155,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
}
} else {
#ifdef EXPRECISION
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
/* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -169,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb);
} else
#endif
if ((mode & BLAS_PREC) == BLAS_DOUBLE){
if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
/* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
double *, BLASLONG, double *, BLASLONG,
@ -199,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
}
}
/* This is a main routine of threads. Each thread waits until job is */
/* queued. */
static DWORD WINAPI blas_thread_server(void *arg){
//
// This is a main routine of threads. Each thread waits until job is queued.
//
static DWORD WINAPI blas_thread_server(void *arg) {
/* Thread identifier */
BLASLONG cpu = (BLASLONG)arg;
@ -213,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
/* Each server needs each buffer */
buffer = blas_memory_alloc(2);
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
#endif
MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
while (1){
while (1) {
/* Waiting for Queue */
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
#endif
// event raised when work is added to the queue
WaitForSingleObject(kickoff_event, INFINITE);
MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
if (cpu > thread_target - 2)
{
//printf("thread [%d] exiting.\n", cpu);
break; // excess thread, so worker thread exits
}
// event raised when work is added to the queue
WaitForSingleObject(kickoff_event, INFINITE);
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
#endif
if (cpu > thread_target - 2) {
//MT_TRACE("thread [%d] exiting.\n", cpu);
break; // excess thread, so worker thread exits
}
MT_TRACE("Server[%2ld] Got it.\n", cpu);
#if 1
EnterCriticalSection(&queue_lock);
queue = work_queue;
@ -245,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){
work_queue = work_queue->next;
LeaveCriticalSection(&queue_lock);
#else
volatile blas_queue_t* queue_next;
INT_PTR prev_value;
do {
queue = (volatile blas_queue_t*)work_queue;
if (!queue)
break;
queue_next = (volatile blas_queue_t*)queue->next;
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
} while (prev_value != queue);
#endif
if (queue) {
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
sa = queue -> sa;
sb = queue -> sb;
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
#endif
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1;
#endif
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1;
#endif
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sa == NULL)
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
if (!(queue -> mode & BLAS_COMPLEX)) {
#ifdef EXPRECISION
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) {
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else
#endif
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
#ifdef BUILD_DOUBLE
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
@ -325,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){
/* Other types in future */
}
}
queue->sb=sb;
queue->sb=sb;
}
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING2;
#endif
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING2;
#endif
if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
} else {
legacy_exec(routine, queue -> mode, queue -> args, sb);
legacy_exec(routine, queue -> mode, queue -> args, sb);
}
}else{
continue; //if queue == NULL
}
} else {
continue; //if queue == NULL
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
#endif
MT_TRACE("Server[%2ld] Finished!\n", cpu);
queue->finished = 1;
queue->finished = 1;
}
/* Shutdown procedure */
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
#endif
MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
blas_memory_free(buffer);
return 0;
}
}
/* Initializing routine */
int blas_thread_init(void){
//
// Initializing routine
//
int blas_thread_init(void) {
BLASLONG i;
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
LOCK_COMMAND(&server_lock);
#ifdef SMP_DEBUG
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
blas_cpu_number);
#endif
MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
if (!blas_server_avail){
// create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
if (!blas_server_avail) {
// create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
thread_target = blas_cpu_number;
thread_target = blas_cpu_number;
InitializeCriticalSection(&queue_lock);
for(i = 0; i < blas_cpu_number - 1; i++){
//printf("thread_init: creating thread [%d]\n", i);
for(i = 0; i < blas_cpu_number - 1; i++) {
//MT_TRACE("thread_init: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
@ -398,15 +371,12 @@ int blas_thread_init(void){
return 0;
}
/*
User can call one of two routines.
exec_blas_async ... immediately returns after jobs are queued.
exec_blas ... returns after jobs are finished.
*/
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
//
// User can call one of two routines.
// exec_blas_async ... immediately returns after jobs are queued.
// exec_blas ... returns after jobs are finished.
//
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
#if defined(SMP_SERVER)
// Handle lazy re-init of the thread-pool after a POSIX fork
@ -426,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
#endif
current->finished = 0;
current->finished = 0;
current = current -> next;
pos ++;
}
@ -435,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
if (!work_queue)
{
work_queue = queue;
work_queue = queue;
}
else
{
blas_queue_t *next_item = work_queue;
blas_queue_t *queue_item = work_queue;
// find the end of the work queue
while (next_item)
next_item = next_item->next;
// find the end of the work queue
while (queue_item->next)
queue_item = queue_item->next;
// add new work to the end
next_item = queue;
// add new work to the end
queue_item->next = queue;
}
LeaveCriticalSection(&queue_lock);
@ -456,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
return 0;
}
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
//
// Join. Wait for all queued tasks to complete
//
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
#ifdef SMP_DEBUG
fprintf(STDERR, "Synchronization Waiting.\n");
#endif
MT_TRACE("Synchronization Waiting.\n");
while (num){
#ifdef SMP_DEBUG
fprintf(STDERR, "Waiting Queue ..\n");
#endif
while (!queue->finished)
YIELDING;
while (num) {
MT_TRACE("Waiting Queue ..\n");
queue = queue->next;
num--;
}
while (!queue->finished)
YIELDING;
queue = queue->next;
num--;
}
MT_TRACE("Completely Done.\n\n");
#ifdef SMP_DEBUG
fprintf(STDERR, "Completely Done.\n\n");
#endif
// if work was added to the queue after this batch we can't sleep the worker threads
// by resetting the event
EnterCriticalSection(&queue_lock);
@ -488,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
return 0;
}
/* Execute Threads */
int exec_blas(BLASLONG num, blas_queue_t *queue){
//
// Execute Threads
//
int exec_blas(BLASLONG num, blas_queue_t *queue) {
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
@ -502,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
if ((num <= 0) || (queue == NULL)) return 0;
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
if ((num > 1) && queue -> next)
exec_blas_async(1, queue -> next);
routine = queue -> routine;
if (queue -> mode & BLAS_LEGACY) {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
} else
} else {
if (queue -> mode & BLAS_PTHREAD) {
void (*pthreadcompat)(void *) = queue -> routine;
(pthreadcompat)(queue -> args);
} else
(routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0);
queue -> sa, queue -> sb, 0);
}
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
if ((num > 1) && queue -> next)
exec_blas_async_wait(num - 1, queue -> next);
return 0;
}
/* Shutdown procedure, but user don't have to call this routine. The */
/* kernel automatically kill threads. */
int BLASFUNC(blas_thread_shutdown)(void){
//
// Shutdown procedure. The user does not have to call this routine; the
// kernel automatically kills the threads.
//
int BLASFUNC(blas_thread_shutdown)(void) {
int i;
@ -532,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
LOCK_COMMAND(&server_lock);
if (blas_server_avail){
if (blas_server_avail) {
for(i = 0; i < blas_num_threads - 1; i++){
for (i = 0; i < blas_num_threads - 1; i++) {
// Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
@ -556,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
return 0;
}
//
// Legacy function to set number of threads
//
void goto_set_num_threads(int num_threads)
{
long i;
@ -569,7 +547,7 @@ void goto_set_num_threads(int num_threads)
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (blas_server_avail && num_threads < blas_num_threads) {
if (blas_server_avail && num_threads < blas_num_threads) {
LOCK_COMMAND(&server_lock);
thread_target = num_threads;
@ -577,11 +555,11 @@ void goto_set_num_threads(int num_threads)
SetEvent(kickoff_event);
for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
WaitForSingleObject(blas_threads[i], INFINITE);
//printf("set_num_threads: thread [%d] has quit.\n", i);
//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
CloseHandle(blas_threads[i]);
}
@ -599,8 +577,8 @@ void goto_set_num_threads(int num_threads)
thread_target = num_threads;
//increased_threads = 1;
if (!blas_server_avail){
//increased_threads = 1;
if (!blas_server_avail) {
// create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
@ -609,8 +587,8 @@ void goto_set_num_threads(int num_threads)
blas_server_avail = 1;
}
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
//printf("set_num_threads: creating thread [%d]\n", i);
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
@ -625,6 +603,9 @@ void goto_set_num_threads(int num_threads)
blas_cpu_number = num_threads;
}
//
// Openblas function to set thread count
//
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);

View File

@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR;
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;

View File

@ -1,6 +1,6 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* Copyright 2023-2024 The OpenBLAS Project */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
@ -143,12 +143,13 @@ extern gotoblas_t gotoblas_ARMV8SVE;
#endif
extern gotoblas_t gotoblas_THUNDERX3T110;
#endif
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
#define NUM_CORETYPES 16
#define NUM_CORETYPES 17
/*
* In case asm/hwcap.h is outdated on the build system, make sure
@ -178,6 +179,7 @@ static char *corename[] = {
"emag8180",
"neoversen1",
"neoversev1",
"neoversev2",
"neoversen2",
"thunderx3t110",
"cortexa55",
@ -198,10 +200,11 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12];
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
return corename[NUM_CORETYPES];
}
@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
case 9: return (&gotoblas_EMAG8180);
case 10: return (&gotoblas_NEOVERSEN1);
case 11: return (&gotoblas_NEOVERSEV1);
case 12: return (&gotoblas_NEOVERSEN2);
case 13: return (&gotoblas_THUNDERX3T110);
case 14: return (&gotoblas_CORTEXA55);
case 15: return (&gotoblas_ARMV8SVE);
case 12: return (&gotoblas_NEOVERSEV2);
case 13: return (&gotoblas_NEOVERSEN2);
case 14: return (&gotoblas_THUNDERX3T110);
case 15: return (&gotoblas_CORTEXA55);
case 16: return (&gotoblas_ARMV8SVE);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_NEOVERSEN1;
}else
return &gotoblas_NEOVERSEV1;
case 0xd4f:
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
} else {
return &gotoblas_NEOVERSEV2;
}
#endif
case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55;

View File

@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
#define CPU_POWER9 9
#define CPU_POWER10 10
#ifndef POWER_9
#define POWER_9 0x20000 /* 9 class CPU */
#endif
#ifndef POWER_10
#define POWER_10 0x40000 /* 10 class CPU */
#endif
#ifdef _AIX
#include <sys/systemcfg.h>
@ -62,7 +69,7 @@ static int cpuid(void)
else if (arch == POWER_9) return CPU_POWER9;
#endif
#ifdef POWER_10
else if (arch == POWER_10) return CPU_POWER10;
else if (arch >= POWER_10) return CPU_POWER10;
#endif
return CPU_UNKNOWN;
}
@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) {
if (gotoblas && gotoblas -> init) {
strncpy(coren,gotoblas_corename(),20);
sprintf(coremsg, "Core: %s\n",coren);
if (getenv("GET_OPENBLAS_CORETYPE")) {
fprintf(stderr, "%s", coremsg);
}
openblas_warning(2, coremsg);
gotoblas -> init();
} else {

View File

@ -3214,7 +3214,7 @@ void blas_shutdown(void){
#endif
memory[pos].lock = 0;
}
if (memory_overflowed)
if (memory_overflowed) {
for (pos = 0; pos < NEW_BUFFERS; pos ++){
newmemory[pos].addr = (void *)0;
newmemory[pos].used = 0;
@ -3222,6 +3222,10 @@ void blas_shutdown(void){
newmemory[pos].pos = -1;
#endif
newmemory[pos].lock = 0;
}
free(newmemory);
newmemory = NULL;
memory_overflowed = 0;
}
UNLOCK_COMMAND(&alloc_lock);

View File

@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef SMP_SERVER
extern void openblas_set_num_threads(int num_threads) ;
extern int openblas_get_num_threads(void) ;
/* Pointer-argument variant of openblas_set_num_threads(): reads the requested
   thread count through num_threads and forwards it unchanged. */
void openblas_set_num_threads_(int* num_threads){
  const int requested = *num_threads;

  openblas_set_num_threads(requested);
}
/* Switch to num_threads threads and record that value in
   blas_omp_threads_local.
   Returns the thread count reported by openblas_get_num_threads() before the
   change, so a caller can pass it back later to restore the old setting. */
int openblas_set_num_threads_local(int num_threads){
  int previous_count;

  /* Sample the current setting first; it is the return value. */
  previous_count = openblas_get_num_threads();

  openblas_set_num_threads(num_threads);
  blas_omp_threads_local = num_threads;

  return previous_count;
}
#else
//Single thread
@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
/* Single-thread build: intentionally a no-op; num_threads is ignored. */
void openblas_set_num_threads_(int* num_threads){
}
/* Single-thread build: num_threads is ignored and the function always
   returns 1. */
int openblas_set_num_threads_local(int num_threads){
return 1;
}
#endif

View File

@ -73,6 +73,10 @@ endif
endif
endif
ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
EXTRALIB += -lxlf90
endif
ifeq ($(C_COMPILER), PGI)
EXTRALIB += -pgf90libs
endif
@ -132,8 +136,12 @@ libgoto_hpl.def : $(GENSYM)
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
ifeq ($(OSNAME), Darwin)
ifeq ($(FIXED_LIBNAME),1)
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib
else
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
@ -169,8 +177,12 @@ INTERNALNAME = $(LIBPREFIX).so
FEXTRALIB += -lm
EXTRALIB += -lm
else
ifeq ($(FIXED_LIBNAME),1)
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so
else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
@ -248,6 +260,20 @@ endif
ifeq ($(OSNAME), AIX)
so : ../$(LIBSONAME) linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
rm -f linktest
../$(LIBSONAME) : aix.exp
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,-bcdtors:all:-2147481648:s,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
aix.exp :
/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
/usr/bin/sort -u > aix.exp
ifeq ($(COMPILER_F77), xlf)
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
@ -289,6 +315,11 @@ test : linktest.c
linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
ifeq ($(F_COMPILER), IBM)
mv linktest.c linktest.c.FIRST
egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c
rm linktest.c.FIRST
endif
clean ::
@rm -f *.def *.dylib __.SYMDEF* *.renamed

View File

@ -60,6 +60,7 @@ cblasobjsc="
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
cblas_scnrm2 cblas_scasum cblas_cgemmt
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
"
cblasobjsd="
cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
@ -69,6 +70,7 @@ cblasobjsd="
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
cblas_damax cblas_damin
"
cblasobjss="
@ -80,6 +82,7 @@ cblasobjss="
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
cblas_strsv cblas_sgeadd cblas_sgemmt
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
cblas_samax cblas_samin
"
cblasobjsz="
@ -91,6 +94,7 @@ cblasobjsz="
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
cblas_zaxpby cblas_zgeadd cblas_zgemmt
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
"
cblasobjs="cblas_xerbla"
@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z
zgedmd
zgedmdq
"
#functions added post 3.11
lapackobjs2c="$lapackobjs2c
claqp2rk
claqp3rk
ctrsyl3
"
# claqz0
# claqz1
# claqz2
# claqz3
# clatrs3
lapackobjs2d="$lapackobjs2d
dgelqs
dgelst
dgeqp3rk
dgeqrs
dlaqp2rk
dlaqp3rk
dlarmm
dlatrs3
dtrsyl3
"
# dlaqz0
# dlaqz1
# dlaqz2
# dlaqz3
# dlaqz4
lapackobjs2z="$lapackobjs2z
zgelqs
zgelst
zgeqp3rk
zgeqrs
zlaqp2rk
zlaqp3rk
zlatrs3
zrscl
ztrsyl3
"
# zlaqz0
# zlaqz1
# zlaqz2
# zlaqz3
lapack_extendedprecision_objs="
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@ -1622,6 +1673,14 @@ lapackeobjsc="
LAPACKE_cgetsqrhrt_work
LAPACKE_cungtsqr_row
LAPACKE_cungtsqr_row_work
LAPACKE_clangb
LAPACKE_clangb_work
LAPACKE_ctrsyl3
LAPACKE_ctrsyl3_work
LAPACKE_ctz_nancheck
LAPACKE_ctz_trans
LAPACKE_cunhr_col
LAPACKE_cunhr_col_work
"
lapackeobjsd="
@ -2239,6 +2298,14 @@ lapackeobjsd="
LAPACKE_dgetsqrhrt_work
LAPACKE_dorgtsqr_row
LAPACKE_dorgtsqr_row_work
LAPACKE_dlangb
LAPACKE_dlangb_work
LAPACKE_dorhr_col
LAPACKE_dorhr_col_work
LAPACKE_dtrsyl3
LAPACKE_dtrsyl3_work
LAPACKE_dtz_nancheck
LAPACKE_dtz_trans
"
lapackeobjss="
@ -2848,6 +2915,14 @@ lapackeobjss="
LAPACKE_sgetsqrhrt_work
LAPACKE_sorgtsqr_row
LAPACKE_sorgtsqr_row_work
LAPACKE_slangb
LAPACKE_slangb_work
LAPACKE_sorhr_col
LAPACKE_sorhr_col_work
LAPACKE_strsyl3
LAPACKE_strsyl3_work
LAPACKE_stz_nancheck
LAPACKE_stz_trans
"
lapackeobjsz="
@ -3515,6 +3590,14 @@ lapackeobjsz="
LAPACKE_zgetsqrhrt_work
LAPACKE_zungtsqr_row
LAPACKE_zungtsqr_row_work
LAPACKE_zlangb
LAPACKE_zlangb_work
LAPACKE_ztrsyl3
LAPACKE_ztrsyl3_work
LAPACKE_ztz_nancheck
LAPACKE_ztz_trans
LAPACKE_zunhr_col
LAPACKE_zunhr_col_work
"
## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s="
ssysv_aa_2stage ssytrf_aa_2stage
ssytrs_aa_2stage
slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
slarfb_gett
"
lapack_embeded_underscore_objs_c="
chetf2_rook chetrf_rook chetri_rook
@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c="
csysv_aa_2stage csytrf_aa_2stage
csytrs_aa_2stage
claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
clarfb_gett
"
lapack_embeded_underscore_objs_d="
dlasyf_rook
@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d="
dsysv_aa_2stage
dsytrf_aa_2stage dsytrs_aa_2stage
dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
dlarfb_gett
"
lapack_embeded_underscore_objs_z="
zhetf2_rook zhetrf_rook zhetri_rook
@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z="
zhetrs_aa_2stage zsysv_aa_2stage
zsytrf_aa_2stage zsytrs_aa_2stage
zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
zlarfb_gett
"
dirname=`pwd -P`/../lapack-netlib

10
f_check
View File

@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
pathf90 pathf95
pgf95 pgf90 pgf77 pgfortran nvfortran
flang egfortran
ifort nagfor ifx ftn crayftn"
ifort nagfor ifx ftn crayftn armflang"
for list in $lists; do
for p in $path; do
@ -85,7 +85,11 @@ else
*Hewlett*)
vendor=CRAY
openmp='-fopenmp'
;;
;;
*Arm\ F90*)
vendor=FLANG
openmp='-fopenmp'
;;
*GNU*|*GCC*)
v="${data#*GCC: *\) }"
@ -108,7 +112,7 @@ else
if [ "$major" -ge 17 ]; then
vendor=FLANGNEW
fi
;;
;;
*ifort*|*ifx*)
vendor=INTEL
openmp='-fopenmp'

View File

@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/sysinfo.h>
#include <unistd.h>
#endif
#if defined(AIX)
#if defined(_AIX)
#include <unistd.h>
#include <sys/systemcfg.h>
#include <sys/sysinfo.h>
#endif
@ -150,6 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_EV4 */
/* #define FORCE_EV5 */
/* #define FORCE_EV6 */
/* #define FORCE_CSKY */
/* #define FORCE_CK860FV */
/* #define FORCE_GENERIC */
#ifdef FORCE_P2
@ -1327,6 +1331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "CORTEXA73"
#endif
/* Forced target CORTEXA76 (ARM64): sets FORCE and supplies the architecture,
   sub-directory, library and core names plus the ARCHCONFIG option string
   (L1/L2 cache and DTB parameters and feature defines) for this core. */
#ifdef FORCE_CORTEXA76
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA76"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA76 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa76"
#define CORENAME "CORTEXA76"
#endif
#ifdef FORCE_CORTEXX1
#define FORCE
#define ARCHITECTURE "ARM64"
@ -1677,9 +1696,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "c910v"
#define CORENAME "C910V"
#endif
#endif
/* Forced target x280 (RISCV64): sets FORCE and supplies the architecture,
   sub-directory, library and core names plus the ARCHCONFIG option string.
   NOTE(review): L1_DATA_SIZE=64536 is not a power of two; 65536 (64 KiB) may
   have been intended -- confirm against the hardware documentation. */
#ifdef FORCE_x280
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "x280"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-Dx280 " \
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "x280"
#define CORENAME "x280"
#else
#endif
/* Forced target RISCV64_ZVL256B: sets FORCE and supplies the architecture,
   sub-directory, library and core names plus the ARCHCONFIG option string.
   NOTE(review): L1_DATA_SIZE=64536 is not a power of two; 65536 (64 KiB) may
   have been intended -- confirm. */
#ifdef FORCE_RISCV64_ZVL256B
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "RISCV64_ZVL256B"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-DRISCV64_ZVL256B " \
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "riscv64_zvl256b"
#define CORENAME "RISCV64_ZVL256B"
#endif
/* Forced target RISCV64_ZVL128B: sets FORCE and supplies the architecture,
   sub-directory, library and core names plus the ARCHCONFIG option string
   (L1 data / L2 cache and DTB parameters). */
#ifdef FORCE_RISCV64_ZVL128B
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "RISCV64_ZVL128B"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-DRISCV64_ZVL128B " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "riscv64_zvl128b"
#define CORENAME "RISCV64_ZVL128B"
#endif
#if defined(FORCE_E2K) || defined(__e2k__)
#define FORCE
@ -1692,6 +1748,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "generic"
#endif
/* Forced target CSKY: sets FORCE and supplies the architecture,
   sub-directory, library and core names plus the ARCHCONFIG option string
   (L1 data / L2 cache and DTB parameters). */
#ifdef FORCE_CSKY
#define FORCE
#define ARCHITECTURE "CSKY"
#define SUBARCHITECTURE "CSKY"
#define SUBDIRNAME "csky"
/* Adjacent string literals are concatenated verbatim, so every fragment must
   end with a space; without it "-DCSKY" fuses with the following option into
   "-DCSKY-DL1_DATA_SIZE=65536". */
#define ARCHCONFIG "-DCSKY " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "csky"
#define CORENAME "CSKY"
#endif
/* Forced target CK860FV (CSKY): sets FORCE and supplies the architecture,
   sub-directory, library and core names plus the ARCHCONFIG option string.
   NOTE(review): SUBARCHITECTURE is "CK860V" while the define, LIBNAME and
   CORENAME all use "CK860FV" -- confirm whether the missing 'F' is intended. */
#ifdef FORCE_CK860FV
#define FORCE
#define ARCHITECTURE "CSKY"
#define SUBARCHITECTURE "CK860V"
#define SUBDIRNAME "csky"
#define ARCHCONFIG "-DCK860FV " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "ck860fv"
#define CORENAME "CK860FV"
#endif
#ifndef FORCE
#ifdef USER_TARGET
@ -1766,7 +1849,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
#ifndef OPENBLAS_SUPPORTED
#error "This arch/CPU is not supported by OpenBLAS."
#endif
@ -1805,11 +1887,13 @@ static int get_num_cores(void) {
return count;
#elif defined(AIX)
#elif defined(_AIX)
//returns the number of processors which are currently online
count = sysconf(_SC_NPROCESSORS_ONLN);
if (count <= 0) count = 2;
return count;
#else
return 2;
#endif
@ -1831,7 +1915,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
printf("CORE=%s\n", get_corename());
#endif
#endif
@ -1979,7 +2063,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif

View File

@ -119,6 +119,7 @@ endif ()
if (BUILD_BFLOAT16)
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
@ -130,6 +131,8 @@ endif ()
foreach (float_type ${FLOAT_TYPES})
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})

View File

@ -270,7 +270,8 @@ CSBLAS1OBJS = \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
cblas_samin.$(SUFFIX)
CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@ -295,7 +296,8 @@ CDBLAS1OBJS = \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
cblas_damin.$(SUFFIX)
CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@ -315,7 +317,7 @@ CCBLAS1OBJS = \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
CCBLAS2OBJS = \
@ -340,12 +342,12 @@ CXERBLAOBJ = \
CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
ifeq ($(BUILD_BFLOAT16),1)
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
endif
@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1)
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif

View File

@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order,
if (ldc < MAX(1, m)) info = 8;
if (lda < MAX(1, m)) info = 5;
if (n < 0) info = 2;
if (m < 0) info = 1;
if (n < 0) info = 1;
if (m < 0) info = 2;
}
if (info >= 0) {

View File

@ -533,8 +533,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1;
else
else {
args.nthreads = num_cpu_avail(3);
if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD);
}
args.common = NULL;
if (args.nthreads == 1) {

View File

@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
char transA, transB, Uplo;
blasint nrowa, nrowb;
#if defined(COMPLEX)
blasint ncolb;
#endif
IFLOAT *buffer;
IFLOAT *aa, *bb;
FLOAT *cc;
@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
uplo = 0;
if (Uplo == 'L')
uplo = 1;
nrowa = m;
if (transa) nrowa = k;
if (transa & 1) nrowa = k;
nrowb = k;
if (transb) nrowb = m;
#if defined(COMPLEX)
ncolb = m;
#endif
if (transb & 1) {
nrowb = m;
#if defined(COMPLEX)
ncolb = k;
#endif
}
info = 0;
if (ldc < MAX(1, m))
info = 13;
if (ldb < MAX(1, nrowa))
if (ldb < MAX(1, nrowb))
info = 10;
if (lda < MAX(1, nrowb))
if (lda < MAX(1, nrowa))
info = 8;
if (k < 0)
info = 5;
@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
blasint info;
blasint lda, ldb;
FLOAT *a, *b;
#if defined(COMPLEX)
blasint nrowb, ncolb;
#endif
XFLOAT *buffer;
PRINT_DEBUG_CNAME;
@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1;
blasint nrowa, nrowb;
blasint nrowa;
#if !defined(COMPLEX)
blasint nrowb;
#endif
nrowa = m;
if (transa) nrowa = k;
if (transa & 1) nrowa = k;
nrowb = k;
if (transb) nrowb = m;
#if defined(COMPLEX)
ncolb = m;
#endif
if (transb & 1) {
nrowb = m;
#if defined(COMPLEX)
ncolb = k;
#endif
}
if (ldc < MAX(1, m))
info = 13;
@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1;
blasint ncola, ncolb;
ncola = k;
if (transa) ncola = m;
ncolb = m;
if (transb) ncolb = k;
blasint ncola;
#if !defined(COMPLEX)
blasint ncolb;
#endif
ncola = m;
if (transa & 1) ncola = k;
ncolb = k;
#if defined(COMPLEX)
nrowb = m;
#endif
if (transb & 1) {
#if defined(COMPLEX)
nrowb = k;
#endif
ncolb = m;
}
if (ldc < MAX(1,m))
info = 13;
if (ldb < MAX(1, ncolb))
info = 10;
if (lda < MAX(1, ncola))
info = 8;
if (lda < MAX(1, ncola))
info = 10;
if (k < 0)
info = 5;
if (m < 0)
info = 4;
if (transb < 0)
info = 3;
if (transa < 0)
info = 2;
if (transa < 0)
info = 3;
if (uplo < 0)
info = 1;
}
@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
IDEBUG_START;
const blasint incb = (transb == 0) ? 1 : ldb;
#if defined(COMPLEX)
if (transb > 1){
#ifndef CBLAS
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
#else
if (order == CblasColMajor)
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
if (order == CblasRowMajor)
IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
#endif
}
#endif
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
if (uplo == 1) {
for (i = 0; i < m; i++) {
@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#if defined(COMPLEX)
aa = a + i * 2;
bb = b + i * ldb * 2;
if (transa) {
if (transa & 1) {
aa = a + lda * i * 2;
}
if (transb)
if (transb & 1)
bb = b + i * 2;
cc = c + i * 2 * ldc + i * 2;
#else
aa = a + i;
bb = b + i * ldb;
if (transa) {
if (transa & 1) {
aa = a + lda * i;
}
if (transb)
if (transb & 1)
bb = b + i;
cc = c + i * ldc + i;
#endif
@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO)
return;
continue;
#else
if (beta != ONE)
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@ -472,13 +522,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
IDEBUG_START;
buffer_size = j + k + 128 / sizeof(FLOAT);
buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
STACK_ALLOC(buffer_size, IFLOAT, buffer);
#ifdef SMP
@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
#if defined(COMPLEX)
if (!transa)
if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
aa, lda, bb, incb, cc, 1,
buffer);
@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
aa, lda, bb, incb, cc, 1,
buffer);
#else
if (!transa)
if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
bb, incb, cc, 1, buffer);
else
@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
#ifdef SMP
} else {
if (!transa)
if (!(transa & 1))
(gemv_thread[(int)transa]) (j, k, alpha, aa,
lda, bb, incb, cc,
1, buffer,
@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
l = j;
#if defined COMPLEX
bb = b + i * ldb * 2;
if (transb) {
if (transb & 1) {
bb = b + i * 2;
}
cc = c + i * 2 * ldc;
#else
bb = b + i * ldb;
if (transb) {
if (transb & 1) {
bb = b + i;
}
cc = c + i * ldc;
@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO)
return;
continue;
#else
if (beta != ONE)
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@ -561,13 +611,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
IDEBUG_START;
buffer_size = j + k + 128 / sizeof(FLOAT);
buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
STACK_ALLOC(buffer_size, IFLOAT, buffer);
#ifdef SMP
@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif
#if defined(COMPLEX)
if (!transa)
if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
a, lda, bb, incb, cc, 1,
buffer);
@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
a, lda, bb, incb, cc, 1,
buffer);
#else
if (!transa)
if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
incb, cc, 1, buffer);
else
@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#ifdef SMP
} else {
if (!transa)
if (!(transa & 1))
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
bb, incb, cc, 1,
buffer, nthreads);

View File

@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
#ifdef SMP
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1;
else
nthreads = num_cpu_avail(2);

View File

@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
}
#endif
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
if ( *rows > *cols )
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT);
else
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT);
b = malloc(msize);
if ( b == NULL )

View File

@ -95,14 +95,19 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
#ifdef SMP
args.common = NULL;
#ifndef DOUBLE
if (args.m*args.n < 40000)
#else
if (args.m*args.n < 10000)
int nmax = 40000;
#else
int nmax = 10000;
#endif
args.nthreads=1;
else
args.nthreads = num_cpu_avail(4);
if (args.m*args.n <nmax) {
args.nthreads = 1;
} else {
args.nthreads = num_cpu_avail(4);
if ((args.m*args.n)/args.nthreads <nmax)
args.nthreads = (args.m*args.n)/nmax;
}
if (args.nthreads == 1) {
#endif

View File

@ -113,13 +113,17 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP
args.common = NULL;
#ifndef DOUBLE
if (args.n <128)
#else
if (args.n <64)
int nmax = 128;
#else
int nmax = 64;
#endif
if (args.n <nmax) {
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(4);
} else {
args.nthreads = num_cpu_avail(4);
if (args.n/args.nthreads <nmax)
args.nthreads = args.n/nmax;
}
if (args.nthreads == 1) {
#endif

View File

@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
if (trans_arg == 'R') trans = 0;
if (trans_arg == 'C') trans = 1;
TOUPPER(uplo_arg);
uplo = -1;
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
TOUPPER(diag_arg);
diag = -1;
if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1;

View File

@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
if (trans_arg == 'R') trans = 2;
if (trans_arg == 'C') trans = 3;
TOUPPER(uplo_arg);
uplo = -1;
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
TOUPPER(diag_arg);
diag = -1;
if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1;

View File

@ -46,6 +46,12 @@
#ifdef USE_ABS
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifndef USE_MIN
/* ABS & MAX */
@ -92,6 +98,8 @@
#else
#define ABS
#ifndef USE_MIN
/* MAX */
@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
if (n <= 0) return 0;
#ifndef COMPLEX
if (incx == 0) return (ABS(*x));
#else
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();
@ -145,14 +159,25 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
#else
#ifdef COMPLEX
FLOAT CNAME(blasint n, void *vx, blasint incx){
FLOAT *x = (FLOAT*) vx;
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
#endif
FLOAT ret;
PRINT_DEBUG_CNAME;
if (n <= 0) return 0;
#ifndef COMPLEX
if (incx == 0) return (ABS(*x));
#else
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
else
{
dp2 = *dd2 * dy1;
if(dp2 == ZERO)
{
dflag = -TWO;
dparam[0] = dflag;
return;
}
dp1 = *dd1 * *dx1;
dq2 = dp2 * dy1;
dq1 = dp1 * *dx1;
@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dh12 = dp2 / dp1;
du = ONE - dh12 * dh21;
if(du > ZERO)
{
dflag = ZERO;
*dd1 = *dd1 / du;
*dd2 = *dd2 / du;
*dx1 = *dx1 * du;
} else {
dflag = -ONE;
dh11 = ZERO;
dh12 = ZERO;
dh21 = ZERO;
dh22 = ZERO;
*dd1 = ZERO;
*dd2 = ZERO;
*dx1 = ZERO;
}
dflag = ZERO;
*dd1 = *dd1 / du;
*dd2 = *dd2 / du;
*dx1 = *dx1 * du;
}
else

447
interface/sbgemmt.c Normal file
View File

@ -0,0 +1,447 @@
/*********************************************************************/
/* Copyright 2024, The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
#define SMP_THRESHOLD_MIN 65536.0
#define ERROR_NAME "SBGEMMT "
#ifndef GEMM_MULTITHREAD_THRESHOLD
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
/*
 * SBGEMMT driver.
 *
 * Updates only one triangle (selected by uplo) of the m-by-m matrix c, using
 * the bfloat16 (IFLOAT) operands a and b and the FLOAT scalars alpha and
 * beta.  The work is done one column of c at a time: for column i a panel of
 * a and one column (or row, when b is transposed) of b are handed to the
 * SBGEMV_N / SBGEMV_T kernel, or to sbgemv_thread_n / sbgemv_thread_t when
 * the panel is large enough to be threaded.
 * NOTE(review): the kernels presumably compute y := alpha*op(A)*x + beta*y;
 * that contract lives in the kernel sources, not in this file.
 *
 * Two entry points share the body that follows the #endif:
 *   - NAME  (Fortran interface, built without CBLAS): all arguments are
 *     passed by reference and the layout is always column-major.
 *   - CNAME (CBLAS interface, built with CBLAS): takes an explicit storage
 *     order.  Row-major input is handled by swapping the roles of A and B so
 *     that the column-major body computes the transpose of C.
 *
 * Argument errors are reported through xerbla with the 1-based index of the
 * offending argument, after which the function returns without touching c.
 */
#ifndef CBLAS

void NAME(char *UPLO, char *TRANSA, char *TRANSB,
	  blasint * M, blasint * K,
	  FLOAT * Alpha,
	  IFLOAT * a, blasint * ldA,
	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
{
	blasint m, k;
	blasint lda, ldb, ldc;
	int transa, transb, uplo;
	blasint info;

	char transA, transB, Uplo;
	blasint nrowa, nrowb;
	IFLOAT *buffer;
	IFLOAT *aa, *bb;
	FLOAT *cc;
	FLOAT alpha, beta;

	PRINT_DEBUG_NAME;

	m = *M;
	k = *K;

	alpha = *Alpha;
	beta = *Beta;

	lda = *ldA;
	ldb = *ldB;
	ldc = *ldC;

	transA = *TRANSA;
	transB = *TRANSB;
	Uplo = *UPLO;
	TOUPPER(transA);
	TOUPPER(transB);
	TOUPPER(Uplo);

	/* -1 marks "not recognised"; it is turned into an xerbla code below. */
	transa = -1;
	transb = -1;
	uplo = -1;

	/* Real data: the conjugating variants collapse onto N and T. */
	if (transA == 'N')
		transa = 0;
	if (transA == 'T')
		transa = 1;
	if (transA == 'R')
		transa = 0;
	if (transA == 'C')
		transa = 1;

	if (transB == 'N')
		transb = 0;
	if (transB == 'T')
		transb = 1;
	if (transB == 'R')
		transb = 0;
	if (transB == 'C')
		transb = 1;

	if (Uplo == 'U')
		uplo = 0;
	if (Uplo == 'L')
		uplo = 1;

	/* Minimum leading dimensions of a and b as stored (column-major). */
	nrowa = m;
	if (transa & 1) nrowa = k;
	nrowb = k;
	if (transb & 1) nrowb = m;

	/* Checks run from the last argument to the first so that the lowest
	   numbered offending argument is the one reported. */
	info = 0;

	if (ldc < MAX(1, m))
		info = 13;
	if (ldb < MAX(1, nrowb))
		info = 10;
	if (lda < MAX(1, nrowa))
		info = 8;
	if (k < 0)
		info = 5;
	if (m < 0)
		info = 4;
	if (transb < 0)
		info = 3;
	if (transa < 0)
		info = 2;
	if (uplo < 0)
		info = 1;

	if (info != 0) {
		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
		return;
	}
#else

void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
	   blasint k,
	   FLOAT alpha,
	   IFLOAT * A, blasint LDA,
	   IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
{
	IFLOAT *aa, *bb;
	FLOAT *cc;

	int transa, transb, uplo;
	blasint info;
	blasint lda, ldb;
	IFLOAT *a, *b;
	XFLOAT *buffer;

	PRINT_DEBUG_CNAME;

	uplo = -1;
	transa = -1;
	transb = -1;
	/* info stays 0 (and is reported) if order is neither value below. */
	info = 0;

	if (order == CblasColMajor) {
		if (Uplo == CblasUpper) uplo = 0;
		if (Uplo == CblasLower) uplo = 1;

		if (TransA == CblasNoTrans)
			transa = 0;
		if (TransA == CblasTrans)
			transa = 1;
		if (TransA == CblasConjNoTrans)
			transa = 0;
		if (TransA == CblasConjTrans)
			transa = 1;

		if (TransB == CblasNoTrans)
			transb = 0;
		if (TransB == CblasTrans)
			transb = 1;
		if (TransB == CblasConjNoTrans)
			transb = 0;
		if (TransB == CblasConjTrans)
			transb = 1;

		a = (void *)A;
		b = (void *)B;
		lda = LDA;
		ldb = LDB;

		info = -1;

		blasint nrowa;
		blasint nrowb;
		nrowa = m;
		if (transa & 1) nrowa = k;
		nrowb = k;
		if (transb & 1) nrowb = m;

		if (ldc < MAX(1, m))
			info = 13;
		if (ldb < MAX(1, nrowb))
			info = 10;
		if (lda < MAX(1, nrowa))
			info = 8;
		if (k < 0)
			info = 5;
		if (m < 0)
			info = 4;
		if (transb < 0)
			info = 3;
		if (transa < 0)
			info = 2;
		if (uplo < 0)
			info = 1;
	}

	if (order == CblasRowMajor) {
		/* A row-major product C = op(A)*op(B) is the column-major product
		   C^T = op(B)^T * op(A)^T, so the operands swap roles. */
		a = (void *)B;
		b = (void *)A;

		lda = LDB;
		ldb = LDA;

		/* Because the body below works on C^T, the triangle the caller
		   asked for is mirrored: the row-major upper triangle is the
		   column-major lower triangle and vice versa. */
		if (Uplo == CblasUpper) uplo = 1;
		if (Uplo == CblasLower) uplo = 0;

		if (TransB == CblasNoTrans)
			transa = 0;
		if (TransB == CblasTrans)
			transa = 1;
		if (TransB == CblasConjNoTrans)
			transa = 0;
		if (TransB == CblasConjTrans)
			transa = 1;

		if (TransA == CblasNoTrans)
			transb = 0;
		if (TransA == CblasTrans)
			transb = 1;
		if (TransA == CblasConjNoTrans)
			transb = 0;
		if (TransA == CblasConjTrans)
			transb = 1;

		info = -1;

		blasint ncola;
		blasint ncolb;

		ncola = m;
		if (transa & 1) ncola = k;
		ncolb = k;

		if (transb & 1) {
			ncolb = m;
		}

		/* Error codes refer to the caller's argument order, hence the
		   swapped 8/10 and 2/3 relative to the column-major branch. */
		if (ldc < MAX(1,m))
			info = 13;
		if (ldb < MAX(1, ncolb))
			info = 8;
		if (lda < MAX(1, ncola))
			info = 10;
		if (k < 0)
			info = 5;
		if (m < 0)
			info = 4;
		if (transb < 0)
			info = 2;
		if (transa < 0)
			info = 3;
		if (uplo < 0)
			info = 1;
	}

	if (info >= 0) {
		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
		return;
	}

#endif
	/* ---- Shared body: column-major, operates on a, b, c, uplo, transa,
	   transb, m, k, lda, ldb, ldc, alpha and beta set up above. ---- */
	int buffer_size;
	blasint i, j;

#ifdef SMP
	int nthreads;
#endif

#ifdef SMP
	/* Threaded kernels, indexed by transa (0 = N, 1 = T). */
	static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
				     BLASLONG, IFLOAT *, BLASLONG, FLOAT,
				     FLOAT *, BLASLONG, int) = {
		sbgemv_thread_n, sbgemv_thread_t,
	};
#endif
	/* Single-threaded kernels, indexed by transa (0 = N, 1 = T). */
	int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
		       IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
		SBGEMV_N, SBGEMV_T,};

	if (m == 0)
		return;

	IDEBUG_START;

	/* Stride through b for one column of op(b): contiguous when b is not
	   transposed, ldb apart when it is. */
	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;

	if (uplo == 1) {
		/* Lower triangle: column i covers rows i .. m-1 (j entries). */
		for (i = 0; i < m; i++) {
			j = m - i;

			/* Panel of op(a) starting at row i, and column i of op(b). */
			aa = a + i;
			bb = b + i * ldb;
			if (transa & 1) {
				aa = a + lda * i;
			}
			if (transb & 1)
				bb = b + i;
			/* Diagonal element of column i. */
			cc = c + i * ldc + i;

#if 0
			if (beta != ONE)
				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);

			if (alpha == ZERO)
				continue;
#endif

			IDEBUG_START;

			/* NOTE(review): buffer is allocated and released here but is
			   not passed to any kernel call in this function. */
			buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
			buffer_size += 160 / sizeof(FLOAT);
#endif
			// for alignment
			buffer_size = (buffer_size + 3) & ~3;
			STACK_ALLOC(buffer_size, IFLOAT, buffer);

#ifdef SMP
			/* Small panels are not worth the threading overhead. */
			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
				nthreads = 1;
			else
				nthreads = num_cpu_avail(2);

			if (nthreads == 1) {
#endif
				/* The kernel takes the stored panel shape: j-by-k when a
				   is not transposed, k-by-j when it is. */
				if (!(transa & 1))
					(gemv[(int)transa]) (j, k, alpha, aa, lda,
							      bb, incb, beta, cc, 1);
				else
					(gemv[(int)transa]) (k, j, alpha, aa, lda,
							      bb, incb, beta, cc, 1);

#ifdef SMP
			} else {
				if (!(transa & 1))
					(gemv_thread[(int)transa]) (j, k, alpha, aa,
								    lda, bb, incb, beta, cc,
								    1, nthreads);
				else
					(gemv_thread[(int)transa]) (k, j, alpha, aa,
								    lda, bb, incb, beta, cc,
								    1, nthreads);
			}
#endif

			STACK_FREE(buffer);
		}
	} else {
		/* Upper triangle: column i covers rows 0 .. i (j entries), so the
		   panel of a always starts at its origin. */
		for (i = 0; i < m; i++) {
			j = i + 1;

			bb = b + i * ldb;
			if (transb & 1) {
				bb = b + i;
			}
			/* Top element of column i. */
			cc = c + i * ldc;

#if 0
			if (beta != ONE)
				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);

			if (alpha == ZERO)
				continue;
#endif
			IDEBUG_START;

			/* NOTE(review): as above, buffer is not used by the kernels. */
			buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
			buffer_size += 160 / sizeof(FLOAT);
#endif
			// for alignment
			buffer_size = (buffer_size + 3) & ~3;
			STACK_ALLOC(buffer_size, IFLOAT, buffer);

#ifdef SMP
			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
				nthreads = 1;
			else
				nthreads = num_cpu_avail(2);

			if (nthreads == 1) {
#endif
				if (!(transa & 1))
					(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
							      incb, beta, cc, 1);
				else
					(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
							      incb, beta, cc, 1);

#ifdef SMP
			} else {
				if (!(transa & 1))
					(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
								    bb, incb, beta, cc, 1,
								    nthreads);
				else
					(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
								    bb, incb, beta, cc, 1,
								    nthreads);
			}
#endif

			STACK_FREE(buffer);
		}
	}

	IDEBUG_END;

	return;
}

View File

@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
{
blasint n = *N;
blasint incx = *INCX;
blasint incy = *INCY;
FLOAT* ALPHA = (FLOAT*) VALPHA;
FLOAT* BETA = (FLOAT*) VBETA;
#else

View File

@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
info = 0;
if (lda < MAX(1, m)) info = 6;
if (lda < MAX(1, m)) info = 5;
if (ldc < MAX(1, m)) info = 8;
if (n < 0) info = 2;
@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order,
if (ldc < MAX(1, m)) info = 8;
if (lda < MAX(1, m)) info = 5;
if (n < 0) info = 2;
if (m < 0) info = 1;
if (n < 0) info = 1;
if (m < 0) info = 2;
}
if (info >= 0) {

View File

@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
}
#endif
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
if ( *rows > *cols )
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2;
else
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2;
b = malloc(msize);
if ( b == NULL )

View File

@ -102,7 +102,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
if (ada >= h *safmin) {
*C = sqrt(ada/h);
*R = *DA / *C;
*(R+1) = *(DA+1) / *(C+1);
*(R+1) = *(DA+1) / *C;
rtmax *= 2.;
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
@ -115,7 +115,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
*C = ada / adahsq;
if (*C >= safmin) {
*R = *DA / *C;
*(R+1) = *(DA+1) / *(C+1);
*(R+1) = *(DA+1) / *C;
} else {
*R = *DA * (h / adahsq);
*(R+1) = *(DA+1) * (h / adahsq);

View File

@ -1349,6 +1349,9 @@ endif ()
set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES)
set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}")
if (USE_GEMM3M)
target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
endif()
endfunction ()

View File

@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n == 1 ) return( ABS(x[0]) );
n *= inc_x;
while(i < n)
while(abs(i) < abs(n))
{
if ( x[i] != 0.0 )

View File

@ -62,7 +62,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
inc_x2 = 2 * inc_x;
n *= inc_x2;
while(i < n)
while(abs(i) < abs(n))
{
if ( x[i] != 0.0 )

View File

@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
else
{
temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
x[ip+1] = da_i * x[ip] ;
}
}

View File

@ -1,3 +1,5 @@
CSUMKERNEL=csum.S
ifndef SNRM2KERNEL
SNRM2KERNEL = ../arm/nrm2.c
endif

View File

@ -1,3 +1,6 @@
CSUMKERNEL = csum_thunderx2t99.c
ZSUMKERNEL = zsum_thunderx2t99.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c

View File

@ -0,0 +1,3 @@
include $(KERNELDIR)/KERNEL.CORTEXA57

View File

@ -91,8 +91,8 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c

View File

@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/* Fixed register assignments used by the inline assembly below.  The macros
   expand to string literals that are pasted into the asm template. */
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define J "x5" /* loop variable */
/* Zero register, used to clear the accumulators. */
#define REG0 "wzr"
/* Scalar accumulator: s0 and d0 are the 32-bit and 64-bit views of v0. */
#define SUMF "s0"
#define SUMFD "d0"
/******************************************************************************/
/* Contiguous case, one element: load 8 bytes from X into d1, advance X by 8,
   add the two 32-bit halves together and accumulate the result into SUMF. */
#define KERNEL_F1 \
"ldr d1, ["X"] \n" \
"add "X", "X", #8 \n" \
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
"fadd s1, s1, s2 \n" \
"fadd "SUMF", "SUMF", s1 \n"
/* Contiguous case, unrolled: load 256 bytes from X into q16-q31, advance X by
   256, add the loaded vectors pairwise and accumulate into v0-v7.  Prefetch
   hints are issued for addresses 1024 bytes and more ahead of X. */
#define KERNEL_F32 \
"ldr q16, ["X"] \n" \
"ldr q17, ["X", #16] \n" \
"ldr q18, ["X", #32] \n" \
"ldr q19, ["X", #48] \n" \
"ldp q20, q21, ["X", #64] \n" \
"ldp q22, q23, ["X", #96] \n" \
"ldp q24, q25, ["X", #128] \n" \
"ldp q26, q27, ["X", #160] \n" \
"fadd v16.4s, v16.4s, v17.4s \n" \
"fadd v18.4s, v18.4s, v19.4s \n" \
"ldp q28, q29, ["X", #192] \n" \
"ldp q30, q31, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fadd v20.4s, v20.4s, v21.4s \n" \
"fadd v22.4s, v22.4s, v23.4s \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"fadd v24.4s, v24.4s, v25.4s \n" \
"fadd v26.4s, v26.4s, v27.4s \n" \
"fadd v0.4s, v0.4s, v16.4s \n" \
"fadd v1.4s, v1.4s, v18.4s \n" \
"fadd v2.4s, v2.4s, v20.4s \n" \
"fadd v3.4s, v3.4s, v22.4s \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fadd v28.4s, v28.4s, v29.4s \n" \
"fadd v30.4s, v30.4s, v31.4s \n" \
"fadd v4.4s, v4.4s, v24.4s \n" \
"fadd v5.4s, v5.4s, v26.4s \n" \
"fadd v6.4s, v6.4s, v28.4s \n" \
"fadd v7.4s, v7.4s, v30.4s \n"
/* Reduce the eight vector accumulators v0-v7 down to a single scalar that is
   left in SUMF. */
#define KERNEL_F32_FINALIZE \
"fadd v0.4s, v0.4s, v1.4s \n" \
"fadd v2.4s, v2.4s, v3.4s \n" \
"fadd v4.4s, v4.4s, v5.4s \n" \
"fadd v6.4s, v6.4s, v7.4s \n" \
"fadd v0.4s, v0.4s, v2.4s \n" \
"fadd v4.4s, v4.4s, v6.4s \n" \
"fadd v0.4s, v0.4s, v4.4s \n" \
"ext v1.16b, v0.16b, v0.16b, #8 \n" \
"fadd v0.2s, v0.2s, v1.2s \n" \
"faddp "SUMF", v0.2s \n"
/* Strided case setup: shift INC_X left by 3, i.e. multiply it by 8 so that it
   can be added directly to the byte address in X. */
#define INIT_S \
"lsl "INC_X", "INC_X", #3 \n"
/* Strided case, one element: same as KERNEL_F1 except that X advances by the
   (already scaled) INC_X instead of a fixed 8 bytes. */
#define KERNEL_S1 \
"ldr d1, ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
"fadd s1, s1, s2 \n" \
"fadd "SUMF", "SUMF", s1 \n"
#if defined(SMP)
/* Threaded level-1 dispatcher, defined outside this file.  It is called by
   CNAME below with a per-thread function and a buffer that receives one
   result slot per thread. */
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
/*
 * Single-threaded worker: returns the sum of all real and imaginary parts of
 * the n complex single-precision elements of x, stepping inc_x elements
 * between entries.  No absolute value is taken anywhere in the kernels, so
 * despite the "asum" naming this is a plain sum.
 *
 * Returns 0 when n <= 0 or inc_x <= 0 (the assembly branches straight to the
 * exit label with the accumulator still cleared).
 *
 * inc_x == 1 takes the contiguous path (32 elements per iteration, then a
 * scalar tail); any other positive stride takes the strided path (4 elements
 * per iteration, then a scalar tail).
 */
static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0 ;

	if ( n < 0 ) return(asum);

	__asm__ __volatile__ (
	" mov "N", %[N_] \n"
	" mov "X", %[X_] \n"
	" mov "INC_X", %[INCX_] \n"
	/* Clear the eight accumulators (writing sN zeroes the whole vN). */
	" fmov "SUMF", "REG0" \n"
	" fmov s1, "REG0" \n"
	" fmov s2, "REG0" \n"
	" fmov s3, "REG0" \n"
	" fmov s4, "REG0" \n"
	" fmov s5, "REG0" \n"
	" fmov s6, "REG0" \n"
	" fmov s7, "REG0" \n"
	" cmp "N", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", #1 \n"
	" bne 5f //asum_kernel_S_BEGIN \n"
	"1: //asum_kernel_F_BEGIN: \n"
	" asr "J", "N", #5 \n"
	" cmp "J", xzr \n"
	" beq 3f //asum_kernel_F1 \n"
	"2: //asum_kernel_F32: \n"
	" "KERNEL_F32" \n"
	" subs "J", "J", #1 \n"
	" bne 2b //asum_kernel_F32 \n"
	" "KERNEL_F32_FINALIZE" \n"
	"3: //asum_kernel_F1: \n"
	" ands "J", "N", #31 \n"
	" ble 9f //asum_kernel_L999 \n"
	"4: //asum_kernel_F10: \n"
	" "KERNEL_F1" \n"
	" subs "J", "J", #1 \n"
	" bne 4b //asum_kernel_F10 \n"
	" b 9f //asum_kernel_L999 \n"
	"5: //asum_kernel_S_BEGIN: \n"
	" "INIT_S" \n"
	" asr "J", "N", #2 \n"
	" cmp "J", xzr \n"
	" ble 7f //asum_kernel_S1 \n"
	"6: //asum_kernel_S4: \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 6b //asum_kernel_S4 \n"
	"7: //asum_kernel_S1: \n"
	" ands "J", "N", #3 \n"
	" ble 9f //asum_kernel_L999 \n"
	"8: //asum_kernel_S10: \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 8b //asum_kernel_S10 \n"
	"9: //asum_kernel_L999: \n"
	" fmov %[ASUM_], "SUMFD" \n"
	: [ASUM_] "=r" (asum) //%0
	: [N_] "r" (n), //%1
	  [X_] "r" (x), //%2
	  [INCX_] "r" (inc_x) //%3
	: "cc",
	  "memory",
	  "x0", "x1", "x2", "x3", "x4", "x5",
	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	  /* KERNEL_F32 loads into q16-q31; they must be declared so the
	     compiler does not keep live values in them across the asm. */
	  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
	  "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
	);

	return asum;
}
#if defined(SMP)
/*
 * Per-thread callback passed to blas_level1_thread_with_return_value.
 * Runs the single-threaded worker on the slice (n, x, inc_x) it is given and
 * stores the partial sum in *result.  The dummy*, y and inc_y arguments are
 * not used.  Always returns 0.
 */
static int casum_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	const FLOAT partial = casum_compute(n, x, inc_x);

	result[0] = partial;

	return 0;
}
#endif
/*
 * Kernel entry point: returns the value produced by casum_compute over the
 * n elements of x with stride inc_x.
 *
 * In SMP builds the work is split across threads when inc_x is non-zero and
 * n exceeds 10000; each thread writes its partial result into its own slot
 * of a local buffer and the slots are then added up here.  Otherwise, and in
 * non-SMP builds, the worker is called directly.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0;

#if defined(SMP)
	FLOAT dummy_alpha;
	int nthreads = 1;

	/* Only large, strided-or-contiguous (non-zero stride) inputs are threaded. */
	if (inc_x != 0 && n > 10000)
		nthreads = num_cpu_avail(1);

	if (nthreads != 1) {
		/* One slot of 2 * sizeof(double) bytes per possible thread. */
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		int mode = BLAS_SINGLE | BLAS_COMPLEX;
		int i;

		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
						     x, inc_x, NULL, 0, result, 0,
						     ( void *)casum_thread_function, nthreads);

		/* Accumulate the partial sums in thread order. */
		for (i = 0; i < nthreads; i++) {
			FLOAT *slot = (FLOAT *)(result + i * (sizeof(double) * 2));

			asum = asum + (*slot);
		}
	} else {
		asum = casum_compute(n, x, inc_x);
	}
#else
	asum = casum_compute(n, x, inc_x);
#endif

	return asum;
}

View File

@ -77,7 +77,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" cmp "N", xzr \n"
" ble 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n"
" ble 9f //nrm2_kernel_L999 \n"
" beq 9f //nrm2_kernel_L999 \n"
"1: //nrm2_kernel_F_BEGIN: \n"
" mov x6, #0x7FF0000000000000 //+Infinity \n"
@ -345,7 +345,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#endif
FLOAT ssq, scale;
if (n <= 0 || inc_x <= 0) return 0.0;
if (n <= 0 || inc_x == 0) return 0.0;
#if defined(SMP)
if (n <= 10000)

View File

@ -229,7 +229,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" cmp "N", xzr \n"
" ble 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n"
" ble 9f //nrm2_kernel_L999 \n"
" beq 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", #1 \n"
" bne 5f //nrm2_kernel_S_BEGIN \n"
@ -315,7 +315,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT nrm2 = 0.0;
double nrm2_double = 0.0;
if (n <= 0 || inc_x <= 0) return 0.0;
if (n <= 0 || inc_x == 0) return 0.0;
#if defined(SMP)
if (n <= 10000)

View File

@ -223,7 +223,7 @@ zscal_begin:
fcmp DA_I, #0.0
beq .Lzscal_kernel_RI_zero
b .Lzscal_kernel_R_zero
// b .Lzscal_kernel_R_zero
.Lzscal_kernel_R_non_zero:

View File

@ -0,0 +1,244 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/*
 * Fixed register assignments and instruction sequences that are pasted into
 * the inline assembly of zasum_compute() by string concatenation.
 *
 * NOTE(review): none of the kernels below applies an absolute value (there is
 * no fabs), so despite the "asum" naming the visible code accumulates a plain
 * sum of the loaded values -- confirm this is the intended kernel.
 */
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define J "x5" /* loop variable */
#define REG0 "xzr" /* zero register, used to clear the accumulators */
#define SUMF "d0" /* scalar accumulator that holds the final result */
#define TMPF "d1" /* not referenced by the macros below, which name d1 directly */
/******************************************************************************/
/*
 * Unit-stride single step: load 16 bytes at X, advance X by 16, add the two
 * 64-bit lanes of the loaded vector together (faddp) and add that to SUMF.
 */
#define KERNEL_F1 \
"ldr q1, ["X"] \n" \
"add "X", "X", #16 \n" \
"faddp d1, v1.2d \n" \
"fadd "SUMF", "SUMF", d1 \n"
/*
 * Unit-stride main step: load 256 bytes from X into q16..q31, advance X by
 * 256, add the loaded vectors together in pairs and accumulate the eight
 * results into v0..v7. Four prefetches are issued at 1024, 1024+64, 1024+128
 * and 1024+192 bytes past the already-advanced X.
 */
#define KERNEL_F16 \
"ldr q16, ["X"] \n" \
"ldr q17, ["X", #16] \n" \
"ldr q18, ["X", #32] \n" \
"ldr q19, ["X", #48] \n" \
"ldp q20, q21, ["X", #64] \n" \
"ldp q22, q23, ["X", #96] \n" \
"ldp q24, q25, ["X", #128] \n" \
"ldp q26, q27, ["X", #160] \n" \
"fadd v16.2d, v16.2d, v17.2d \n" \
"fadd v18.2d, v18.2d, v19.2d \n" \
"ldp q28, q29, ["X", #192] \n" \
"ldp q30, q31, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fadd v20.2d, v20.2d, v21.2d \n" \
"fadd v22.2d, v22.2d, v23.2d \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"fadd v24.2d, v24.2d, v25.2d \n" \
"fadd v26.2d, v26.2d, v27.2d \n" \
"fadd v28.2d, v28.2d, v29.2d \n" \
"fadd v30.2d, v30.2d, v31.2d \n" \
"fadd v0.2d, v0.2d, v16.2d \n" \
"fadd v1.2d, v1.2d, v18.2d \n" \
"fadd v2.2d, v2.2d, v20.2d \n" \
"fadd v3.2d, v3.2d, v22.2d \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fadd v4.2d, v4.2d, v24.2d \n" \
"fadd v5.2d, v5.2d, v26.2d \n" \
"fadd v6.2d, v6.2d, v28.2d \n" \
"fadd v7.2d, v7.2d, v30.2d \n"
/*
 * Collapse the eight vector accumulators v0..v7 into v0, then add the two
 * lanes of v0 together into SUMF.
 */
#define KERNEL_F16_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \
"fadd v6.2d, v6.2d, v7.2d \n" \
"fadd v0.2d, v0.2d, v2.2d \n" \
"fadd v4.2d, v4.2d, v6.2d \n" \
"fadd v0.2d, v0.2d, v4.2d \n" \
"faddp "SUMF", v0.2d \n"
/*
 * Strided setup: multiply INC_X by 16 (shift left by 4) so that KERNEL_S1 can
 * add it to X directly; KERNEL_S1 loads 16 bytes per step.
 */
#define INIT_S \
"lsl "INC_X", "INC_X", #4 \n"
/*
 * Strided single step: same arithmetic as KERNEL_F1, but X is advanced by the
 * scaled INC_X instead of a constant 16.
 */
#define KERNEL_S1 \
"ldr q1, ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"faddp d1, v1.2d \n" \
"fadd "SUMF", "SUMF", d1 \n"
#if defined(SMP)
/*
 * Threading helper implemented elsewhere in the project. In this file it is
 * called only from CNAME, which passes zasum_thread_function as `function`,
 * a stack buffer as `c`, and afterwards reads one value per thread back from
 * that buffer.
 * NOTE(review): how m and a are divided between threads is not visible from
 * this file -- see the helper's definition.
 */
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
/*
 * Accumulates the values of a strided vector with AArch64 inline assembly.
 *
 *   n      number of 16-byte elements to process
 *   x      address of the first element
 *   inc_x  distance between consecutive elements, in 16-byte elements
 *
 * Returns the accumulated value; returns 0.0 when n <= 0 or inc_x <= 0 (both
 * cases branch straight to the exit label with the accumulator still zero).
 *
 * Control flow inside the asm:
 *   inc_x == 1 : label 2 consumes 16 elements per iteration (KERNEL_F16),
 *                label 4 handles the remaining n & 15 elements one at a time.
 *   otherwise  : INIT_S scales the stride, label 6 consumes 4 elements per
 *                iteration, label 8 handles the remaining n & 3 elements.
 *
 * NOTE(review): the kernels add the loaded values as they are (no fabs), so
 * despite its name this routine computes a plain sum -- confirm intent.
 *
 * The clobber list names every register the asm writes: x0..x5, the
 * accumulators d0..d7, and d16..d31, which KERNEL_F16 uses as load targets.
 */
static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0 ;
	if ( n < 0 ) return(asum);
	__asm__ __volatile__ (
	" mov "N", %[N_] \n"
	" mov "X", %[X_] \n"
	" mov "INC_X", %[INCX_] \n"
	" fmov "SUMF", "REG0" \n"
	" fmov d1, "REG0" \n"
	" fmov d2, "REG0" \n"
	" fmov d3, "REG0" \n"
	" fmov d4, "REG0" \n"
	" fmov d5, "REG0" \n"
	" fmov d6, "REG0" \n"
	" fmov d7, "REG0" \n"
	" cmp "N", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", #1 \n"
	" bne 5f //asum_kernel_S_BEGIN \n"
	"1: //asum_kernel_F_BEGIN: \n"
	" asr "J", "N", #4 \n"
	" cmp "J", xzr \n"
	" beq 3f //asum_kernel_F1 \n"
	".align 5 \n"
	"2: //asum_kernel_F16: \n"
	" "KERNEL_F16" \n"
	" subs "J", "J", #1 \n"
	" bne 2b //asum_kernel_F16 \n"
	" "KERNEL_F16_FINALIZE" \n"
	"3: //asum_kernel_F1: \n"
	" ands "J", "N", #15 \n"
	" ble 9f //asum_kernel_L999 \n"
	"4: //asum_kernel_F10: \n"
	" "KERNEL_F1" \n"
	" subs "J", "J", #1 \n"
	" bne 4b //asum_kernel_F10 \n"
	" b 9f //asum_kernel_L999 \n"
	"5: //asum_kernel_S_BEGIN: \n"
	" "INIT_S" \n"
	" asr "J", "N", #2 \n"
	" cmp "J", xzr \n"
	" ble 7f //asum_kernel_S1 \n"
	"6: //asum_kernel_S4: \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 6b //asum_kernel_S4 \n"
	"7: //asum_kernel_S1: \n"
	" ands "J", "N", #3 \n"
	" ble 9f //asum_kernel_L999 \n"
	"8: //asum_kernel_S10: \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 8b //asum_kernel_S10 \n"
	"9: //asum_kernel_L999: \n"
	" fmov %[ASUM_], "SUMF" \n"
	: [ASUM_] "=r" (asum) //%0
	: [N_] "r" (n), //%1
	[X_] "r" (x), //%2
	[INCX_] "r" (inc_x) //%3
	: "cc",
	"memory",
	"x0", "x1", "x2", "x3", "x4", "x5",
	"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	/* load targets of KERNEL_F16 (q16..q31) are overwritten as well */
	"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
	"d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
	);
	return asum;
}
#if defined(SMP)
/*
 * Worker callback handed to blas_level1_thread_with_return_value() by CNAME.
 * Runs zasum_compute() on the (n, x, inc_x) arguments it receives and stores
 * the value through `result`. The dummy0..dummy3, y and inc_y parameters are
 * accepted but never read. Always returns 0.
 */
static int zasum_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	const FLOAT partial = zasum_compute(n, x, inc_x);

	result[0] = partial;
	return 0;
}
#endif
/*
 * Public entry point.
 *
 * Without SMP the work is done by a single zasum_compute() call.  With SMP the
 * same single call is used when inc_x == 0, when n <= 10000, or when
 * num_cpu_avail(1) reports one thread.  Otherwise zasum_thread_function is
 * dispatched through blas_level1_thread_with_return_value() and the
 * per-thread values are added together.
 *
 * Per-thread values are read back from `result` at a spacing of
 * 2 * sizeof(double) bytes, one FLOAT from the start of each slot.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0;
#if defined(SMP)
	int nthreads = 1;
	FLOAT dummy_alpha;

	/* only large vectors with a non-zero stride are worth threading */
	if (inc_x != 0 && n > 10000)
		nthreads = num_cpu_avail(1);

	if (nthreads != 1) {
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		int i;

		blas_level1_thread_with_return_value(BLAS_DOUBLE | BLAS_COMPLEX,
				n, 0, 0, &dummy_alpha,
				x, inc_x, NULL, 0, result, 0,
				( void *)zasum_thread_function, nthreads);

		/* one slot of 2 * sizeof(double) bytes per thread */
		for (i = 0; i < nthreads; i++)
			asum += *(FLOAT *)(result + i * (sizeof(double) * 2));

		return asum;
	}
#endif
	asum = zasum_compute(n, x, inc_x);
	return asum;
}

149
kernel/csky/KERNEL Normal file
View File

@ -0,0 +1,149 @@
# Kernel selection for this target: every routine is mapped to a portable C
# source taken from ../arm or ../generic; no target-specific source is listed.

# --- max/min of absolute values and of signed values ---
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
# --- index-returning variants of the max/min routines ---
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
# --- asum / sum ---
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
# --- axpy / copy / dot ---
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
# DSDOT is the only dot variant taken from ../generic rather than ../arm
DSDOTKERNEL = ../generic/dot.c
# --- nrm2 / rot / scal / swap ---
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
# --- gemv (non-transposed and transposed) ---
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
# --- trmm: generic 2x2 kernels ---
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# --- gemm: generic 2x2 kernels with the matching width-2 copy routines;
#     the *OBJ variables name the object files built from the copy sources ---
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- trsm: all four precisions point at the same generic sources ---
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# --- miscellaneous helpers: cabs, lsame, gemm beta scaling ---
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
LSAME_KERNEL = ../generic/lsame.c
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c

1
kernel/csky/Makefile Normal file
View File

@ -0,0 +1 @@
# Double-colon clean rule with no prerequisites and no recipe: this directory
# contributes nothing of its own to remove.
clean ::

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,
if ( rows <= 0 ) return(0);
if ( cols <= 0 ) return(0);
if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0);
aptr = a;
lda *= 2;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,587 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copies `rows` rows of `width` consecutive FLOATs from `src` into the
 * contiguous buffer `dst`, negating every value. Consecutive source rows are
 * `stride` FLOATs apart. Rows are consumed two at a time, followed by a
 * single-row tail when (rows & 1) is set. Returns the position in `dst` just
 * past the last value written.
 *
 * Assumes `src` and `dst` do not overlap.
 */
static FLOAT *neg_copy_panel(BLASLONG rows, const FLOAT *src, BLASLONG stride,
	BLASLONG width, FLOAT *dst)
{
	const FLOAT *upper = src;           /* first row of the current pair  */
	const FLOAT *lower = src + stride;  /* second row of the current pair */
	BLASLONG pairs, k;

	for (pairs = rows >> 1; pairs > 0; pairs--) {
		for (k = 0; k < width; k++)
			dst[k] = -upper[k];
		for (k = 0; k < width; k++)
			dst[width + k] = -lower[k];

		upper += 2 * stride;
		lower += 2 * stride;
		dst   += 2 * width;
	}

	if (rows & 1) {
		for (k = 0; k < width; k++)
			dst[k] = -upper[k];
		dst += width;
	}

	return dst;
}

/*
 * Packs the m-row matrix `a` into `b` with every value negated.
 *
 * The columns of `a` are walked in panels: first (n >> 4) panels of 32 FLOATs,
 * then at most one panel each of 16, 8, 4 and 2 FLOATs, chosen by bits 3..0 of
 * n. Within a panel all m rows are written to `b` back to back, so `b` is
 * filled strictly sequentially.
 *
 * `lda` is doubled on entry before being used as the row stride in FLOATs
 * (presumably because every element occupies two FLOATs -- confirm against
 * the callers). Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
	FLOAT *panel = a;
	FLOAT *out = b;
	BLASLONG blocks, width;

	lda *= 2;
#if 0
	fprintf(stderr, "M = %d N = %d\n", m, n);
#endif

	/* full-width panels: 32 FLOATs per row */
	for (blocks = (n >> 4); blocks > 0; blocks--) {
		out = neg_copy_panel(m, panel, lda, 32, out);
		panel += 32;
	}

	/* remainder panels: 16, 8, 4, 2 FLOATs per row for n & 8, 4, 2, 1 */
	for (width = 16; width >= 2; width >>= 1) {
		if (n & (width >> 1)) {
			out = neg_copy_panel(m, panel, lda, width, out);
			panel += width;
		}
	}

	return 0;
}

View File

@ -0,0 +1,333 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack an m-row by n-column region of `a` into the contiguous buffer `b`.
 *
 * Every element is a pair of consecutive FLOATs (lda is doubled on entry so
 * that it counts FLOATs).  Columns are handled in panels of 16, then at most
 * one panel each of 8, 4, 2 and 1, chosen from the bits of n.  Within a
 * panel, each row writes `width` pairs to b, so b advances by 2 * width
 * FLOATs per row.
 *
 * For panel column k and row step t (t = 0 .. m-1) the source element is
 *     a + (posX + k) * 2 + (posY + t) * lda    while posX + k >  posY + t
 *     a + (posY + t) * 2 + (posX + k) * lda    once  posX + k <= posY + t
 * so only one side of the diagonal of `a` is ever read.
 * NOTE(review): with column-major storage this is the lower triangle, which
 * is what a SYMM-style "lower" pack routine would do -- confirm against the
 * build rule that defines CNAME for this file.
 *
 * Parameters:
 *   m, n       - number of rows / columns to pack
 *   a, lda     - source matrix and its leading dimension (in elements)
 *   posX, posY - starting column / row position of the region inside `a`
 *   b          - destination buffer, receives 2 * m * n FLOATs
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  FLOAT  *src[16];      /* one read cursor per column of the current panel */
  FLOAT   staged[32];   /* one packed row: up to 16 (re, im) pairs         */
  BLASLONG width, panels, rows_left, k, diag;

  /* Two FLOATs per element: convert the leading dimension to FLOAT units. */
  lda *= 2;

  /* Start with as many full 16-wide panels as fit; the narrower widths
     below are each processed at most once, when the matching bit of n
     is set. */
  width  = 16;
  panels = (n >> 4);

  while (width > 0) {

    while (panels > 0) {

      /* Signed distance of the panel's first column from the diagonal. */
      diag = posX - posY;

      /* Column k starts on the "row" side of the diagonal when
         diag > -k, otherwise on the "column" side. */
      for (k = 0; k < width; k++) {
        if (diag > -k) {
          src[k] = a + (posX + k) * 2 + posY * lda;
        } else {
          src[k] = a + posY * 2 + (posX + k) * lda;
        }
      }

      for (rows_left = m; rows_left > 0; rows_left--) {

        /* Load the whole row first and step every cursor: by lda while
           still before the diagonal crossing, by one element (2 FLOATs)
           afterwards. */
        for (k = 0; k < width; k++) {
          staged[2 * k + 0] = *(src[k] + 0);
          staged[2 * k + 1] = *(src[k] + 1);

          if (diag > -k) {
            src[k] += lda;
          } else {
            src[k] += 2;
          }
        }

        /* Only after all loads are done, emit the row to b. */
        for (k = 0; k < 2 * width; k++) {
          b[k] = staged[k];
        }

        b += 2 * width;
        diag--;
      }

      posX += width;
      panels--;
    }

    /* Next narrower panel: 8, 4, 2, 1; width reaches 0 after the last one,
       which also makes (n & width) zero and ends the outer loop. */
    width >>= 1;
    panels = (n & width) ? 1 : 0;
  }

  return 0;
}

Some files were not shown because too many files have changed in this diff Show More