Merge pull request #4606 from OpenMathLib/develop

Merge develop branch for 0.3.27
This commit is contained in:
Martin Kroeker 2024-04-04 22:24:56 +02:00 committed by GitHub
commit 8f3bb62254
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2639 changed files with 141761 additions and 40705 deletions

View File

@ -1,44 +1,44 @@
macos_instance: macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
task: #task:
name: AppleM1/LLVM # name: AppleM1/LLVM
compile_script: # compile_script:
- brew install llvm # - brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang # - make TARGET=VORTEX USE_OPENMP=1 CC=clang
task: #task:
name: AppleM1/LLVM/ILP64 # name: AppleM1/LLVM/ILP64
compile_script: # compile_script:
- brew install llvm # - brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 # - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
task: #task:
name: AppleM1/LLVM/CMAKE # name: AppleM1/LLVM/CMAKE
compile_script: # compile_script:
- brew install llvm # - brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- mkdir build # - mkdir build
- cd build # - cd build
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. # - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
- make -j 4 # - make -j 4
task: #task:
name: AppleM1/GCC/MAKE/OPENMP # name: AppleM1/GCC/MAKE/OPENMP
compile_script: # compile_script:
- brew install gcc@11 # - brew install gcc@11
- export PATH=/opt/homebrew/bin:$PATH # - export PATH=/opt/homebrew/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/lib" # - export LDFLAGS="-L/opt/homebrew/lib"
- export CPPFLAGS="-I/opt/homebrew/include" # - export CPPFLAGS="-I/opt/homebrew/include"
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 # - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
macos_instance: macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
@ -58,8 +58,8 @@ task:
- export VALID_ARCHS="i386 x86_64" - export VALID_ARCHS="i386 x86_64"
- xcrun --sdk macosx --show-sdk-path - xcrun --sdk macosx --show-sdk-path
- xcodebuild -version - xcodebuild -version
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64" - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64"
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always: always:
config_artifacts: config_artifacts:
@ -78,8 +78,8 @@ task:
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0"
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1 - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
always: always:
config_artifacts: config_artifacts:
@ -91,14 +91,16 @@ macos_instance:
task: task:
name: AppleM1/LLVM armv7-androidndk xbuild name: AppleM1/LLVM armv7-androidndk xbuild
compile_script: compile_script:
- #brew install android-ndk - brew install android-ndk
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib" - ls /System/Volumes/Data/opt/homebrew
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
always: always:
config_artifacts: config_artifacts:

149
.github/workflows/apple_m.yml vendored Normal file
View File

@ -0,0 +1,149 @@
name: apple m
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
build:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: macos-14
strategy:
fail-fast: false
matrix:
build: [cmake, make]
fortran: [gfortran]
openmp: [0, 1]
ilp64: [0, 1]
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Print system information
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
cat /proc/cpuinfo
elif [ "$RUNNER_OS" == "macOS" ]; then
sysctl -a | grep machdep.cpu
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
- name: Install Dependencies
  # Installs the compilers and build tools required by the later steps.
  # Only Linux and macOS runners are handled; anything else fails the job.
  run: |
    if [ "$RUNNER_OS" == "Linux" ]; then
      # Refresh the package index first so the install does not hit stale
      # mirrors (same sequence as the sibling dynamic_arch workflow).
      sudo apt-get update
      sudo apt-get install -y gfortran cmake ccache libtinfo5
    elif [ "$RUNNER_OS" == "macOS" ]; then
      # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
      brew reinstall gcc
      brew install coreutils cmake ccache
      brew install llvm
    else
      echo "::error::$RUNNER_OS not supported"
      exit 1
    fi
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
# GNU make and cmake call the compilers differently. It looks like
# that causes the cache to mismatch. Keep the ccache for both build
# tools separate to avoid polluting each other.
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
ccache-${{ runner.os }}-${{ matrix.build }}
# Step: for `make` builds, appends the ccache wrapper directory to $GITHUB_PATH
# (on macOS also the Homebrew LLVM bin directory), then writes ~/.ccache/ccache.conf
# with a 300M size cap and compression enabled, and prints the ccache statistics.
# NOTE(review): `echo "" >>$GITHUB_PATH` appends an empty entry; it looks
# unintentional - confirm whether it can be dropped.
# NOTE(review): the ccache path is derived from `brew --prefix` while the LLVM
# path is hard-coded to /opt/homebrew - confirm both are meant to agree.
- name: Configure ccache
run: |
if [ "${{ matrix.build }}" = "make" ]; then
# Add ccache to path
if [ "$RUNNER_OS" = "Linux" ]; then
echo "/usr/lib/ccache" >> $GITHUB_PATH
elif [ "$RUNNER_OS" = "macOS" ]; then
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
echo "" >>$GITHUB_PATH
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
fi
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
- name: Build OpenBLAS
run: |
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
export CC="/opt/homebrew/opt/llvm/bin/clang"
case "${{ matrix.build }}" in
"make")
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
;;
"cmake")
export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
mkdir build && cd build
cmake -DDYNAMIC_ARCH=1 \
-DUSE_OPENMP=${{matrix.openmp}} \
-DINTERFACE64=${{matrix.ilp64}} \
-DNOFORTRAN=0 \
-DBUILD_WITHOUT_LAPACK=0 \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
cmake --build .
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac
- name: Show ccache status
continue-on-error: true
run: ccache -s
# Step: runs the test suites, limited to 60 minutes.
# `make` builds run the `test`, `ctest` and `utest` directories in turn, each
# wrapped in a collapsible log group; `cmake` builds run `ctest` from ./build.
# Any other matrix.build value fails the step.
# NOTE(review): MAKE_FLAGS hard-codes USE_OPENMP=0 and passes no INTERFACE64,
# whereas the "Build OpenBLAS" step builds with the matrix values of
# openmp/ilp64 - confirm the tests are meant to use different settings.
- name: Run tests
timeout-minutes: 60
run: |
case "${{ matrix.build }}" in
"make")
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
echo "::group::Tests in 'test' directory"
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'ctest' directory"
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'utest' directory"
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
;;
"cmake")
cd build && ctest
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac

View File

@ -14,8 +14,8 @@ jobs:
if: "github.repository == 'OpenMathLib/OpenBLAS'" if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-latest runs-on: ubuntu-latest
env: env:
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282 xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
@ -76,7 +76,7 @@ jobs:
run: | run: |
wget ${xuetie_toolchain}/${toolchain_file_name} wget ${xuetie_toolchain}/${toolchain_file_name}
tar -xvf ${toolchain_file_name} -C /opt tar -xvf ${toolchain_file_name} -C /opt
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH" export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

View File

@ -42,6 +42,7 @@ jobs:
- name: Install Dependencies - name: Install Dependencies
run: | run: |
if [ "$RUNNER_OS" == "Linux" ]; then if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get update
sudo apt-get install -y gfortran cmake ccache libtinfo5 sudo apt-get install -y gfortran cmake ccache libtinfo5
elif [ "$RUNNER_OS" == "macOS" ]; then elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.

253
.github/workflows/riscv64_vector.yml vendored Normal file
View File

@ -0,0 +1,253 @@
name: riscv64 zvl256b qemu test
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
TEST:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-latest
env:
triple: riscv64-unknown-linux-gnu
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
riscv_gnu_toolchain_version: 13.2.0
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
strategy:
fail-fast: false
matrix:
include:
- target: RISCV64_ZVL128B
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
- target: RISCV64_ZVL256B
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: install build deps
  # Installs the host build tools, then downloads the prebuilt RISC-V
  # toolchain archive named in the job-level env and unpacks it into /opt.
  run: |
    sudo apt-get update
    # -y keeps the install non-interactive, matching the other workflows'
    # `apt-get install -y` calls instead of relying on runner apt defaults.
    sudo apt-get install -y autoconf automake autotools-dev ninja-build make \
      libgomp1-riscv64-cross ccache
    wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
    tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.target }}
- name: Configure ccache
run: |
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
- name: build OpenBLAS libs
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
- name: build OpenBLAS tests
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='${triple}-gcc' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
- name: build lapack-netlib tests
working-directory: ./lapack-netlib/TESTING
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='${triple}-gcc' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
- name: OpenBLAS tests
shell: bash
run: |
export PATH="/opt/riscv/bin:$PATH"
export QEMU_CPU=${{ matrix.qemu_cpu }}
rm -rf ./test_out
mkdir -p ./test_out
# run_test DIR CMD [DATA]
#   Runs ./DIR/CMD under qemu-riscv64, feeding ./DIR/DATA on stdin when DATA is
#   given. Combined stdout/stderr is echoed to the job log and appended to
#   ./test_out/DIR.CMD. If the emulated program exits non-zero, a "*** FAIL"
#   marker is appended to that log so the summary loop after `wait` finds it.
run_test() {
  local DIR=$1 CMD=$2 DATA=$3
  local OUTPUT="./test_out/$DIR.$CMD"
  local RV=0

  # Header line identifying the test; tee below must append (-a) or this
  # line would be truncated away again.
  echo "$(pwd)/$DIR/$CMD $DIR/$DATA" >> "$OUTPUT"

  # The group sits on the left of `||` so that errexit (enabled by
  # `shell: bash`, i.e. `bash -eo pipefail`) does not terminate this
  # background job before the exit status has been recorded.
  # PIPESTATUS[0] is qemu's own status; `$?` would reflect the pipeline
  # (tee's status unless pipefail is set).
  {
    if [[ -z $DATA ]]; then
      qemu-riscv64 "./$DIR/$CMD" |& tee -a "$OUTPUT"
      RV=${PIPESTATUS[0]}
    else
      qemu-riscv64 "./$DIR/$CMD" < "./$DIR/$DATA" |& tee -a "$OUTPUT"
      RV=${PIPESTATUS[0]}
    fi
  } || true

  if [[ $RV -ne 0 ]]; then
    echo "*** FAIL: nonzero exit code $RV" >> "$OUTPUT"
  fi
}
run_test test cblat1 &
run_test test cblat2 cblat2.dat &
run_test test cblat3 cblat3.dat &
run_test test dblat1 &
run_test test dblat2 dblat2.dat &
run_test test dblat3 dblat3.dat &
run_test test sblat1 &
run_test test sblat2 sblat2.dat &
run_test test sblat3 sblat3.dat &
run_test test zblat1 &
run_test test zblat2 zblat2.dat &
run_test test zblat3 zblat3.dat &
run_test ctest xccblat1 &
run_test ctest xccblat2 cin2 &
run_test ctest xccblat3 cin3 &
run_test ctest xdcblat1 &
run_test ctest xdcblat2 din2 &
run_test ctest xdcblat3 din3 &
run_test ctest xscblat1 &
run_test ctest xscblat2 sin2 &
run_test ctest xscblat3 sin3 &
run_test ctest xzcblat1 &
run_test ctest xzcblat2 zin2 &
run_test ctest xzcblat3 zin3 &
wait
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
- name: netlib tests
shell: bash
run: |
: # these take a very long time
echo "Skipping netlib tests in CI"
exit 0
: # comment out exit above to enable the tests
: # probably we want to identify a subset to run in CI
export PATH="/opt/riscv/bin:$PATH"
export QEMU_CPU=${{ matrix.qemu_cpu }}
rm -rf ./test_out
mkdir -p ./test_out
# run_test OUTFILE INFILE PROGRAM DESCRIPTION
#   Runs ./lapack-netlib/TESTING/PROGRAM under qemu-riscv64 with
#   ./lapack-netlib/TESTING/INFILE on stdin. Combined stdout/stderr is echoed
#   to the job log and appended to ./test_out/OUTFILE after a two-line header
#   (DESCRIPTION, then the program path). "*** FAIL" markers are appended when
#   the program exits non-zero, when the log contains 'fail', or when it
#   contains an 'rror' line other than "passed" / "largest error" lines.
run_test() {
  local OUTPUT="./test_out/$1"
  local DATA="./lapack-netlib/TESTING/$2"
  local CMD="./lapack-netlib/TESTING/$3"
  local RV=0

  echo "$4" >> "$OUTPUT"
  echo "$CMD" >> "$OUTPUT"

  # Grouped on the left of `||` so errexit (`shell: bash` runs
  # `bash -eo pipefail`) cannot kill this background job before the status
  # is recorded. tee appends (-a) so the header above is kept, and
  # PIPESTATUS[0] is qemu's status rather than the pipeline's.
  {
    qemu-riscv64 "$CMD" < "$DATA" |& tee -a "$OUTPUT"
    RV=${PIPESTATUS[0]}
  } || true

  if [[ $RV -ne 0 ]]; then
    echo "*** FAIL: nonzero exit code $RV" >> "$OUTPUT"
  fi
  if grep -q fail "$OUTPUT"; then
    echo "*** FAIL: log contains 'fail'" >> "$OUTPUT"
  fi
  # The first grep must print its matches for the filter to see them (with -q
  # it printed nothing, so this check could never fire). Output is discarded
  # via redirection instead of -q so no stage exits early under pipefail.
  # NOTE(review): assumes the intent is "any 'rror' line that is neither a
  # 'passed' line nor a 'largest error' line" - confirm.
  if grep rror "$OUTPUT" | grep -v -e "passed" -e "largest error" > /dev/null; then
    echo "*** FAIL: log contains 'error'" >> "$OUTPUT"
  fi
}
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" &
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" &
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" &
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" &
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" &
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" &
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" &
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" &
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" &
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" &
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" &
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" &
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" &
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" &
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" &
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" &
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" &
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" &
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" &
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" &
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" &
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" &
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" &
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" &
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" &
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" &
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" &
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" &
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" &
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" &
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" &
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" &
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" &
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" &
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" &
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" &
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" &
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" &
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" &
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" &
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" &
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" &
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" &
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" &
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" &
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" &
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" &
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" &
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" &
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" &
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" &
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" &
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" &
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" &
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" &
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" &
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" &
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" &
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" &
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" &
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" &
# Wait for every background run_test job before inspecting the logs.
wait
# Print each log that carries a FAIL marker and remember that something failed.
while IFS= read -r -d $'\0' LOG; do
  cat "$LOG"
  FAILURES=1
done < <(grep -lZ FAIL ./test_out/*)
# Summarise the logs with the LAPACK testing script.
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
# `|| true`: a missing totals line must not abort under errexit; the -1
# defaults below then report the failure and print the summary.
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary || true)"
NUMERICAL_ERRORS=-1
OTHER_ERRORS=-1
# Source the two counters from fields 5 and 7 of the 'ALL PRECISIONS' line.
# (The closing parenthesis of the process substitution was missing.)
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )); then
  cat netlib_summary
  FAILURES=1
fi
if [[ ! -z $FAILURES ]]; then
  echo "=========="
  echo "== FAIL =="
  echo "=========="
  echo
  exit 1
fi

13
.gitignore vendored
View File

@ -47,46 +47,59 @@ config_last.h
getarch getarch
getarch_2nd getarch_2nd
utest/openblas_utest utest/openblas_utest
utest/openblas_utest_ext
ctest/xccblat1 ctest/xccblat1
ctest/xccblat2 ctest/xccblat2
ctest/xccblat3 ctest/xccblat3
ctest/xccblat3_3m
ctest/xdcblat1 ctest/xdcblat1
ctest/xdcblat2 ctest/xdcblat2
ctest/xdcblat3 ctest/xdcblat3
ctest/xdcblat3_3m
ctest/xscblat1 ctest/xscblat1
ctest/xscblat2 ctest/xscblat2
ctest/xscblat3 ctest/xscblat3
ctest/xscblat3_3m
ctest/xzcblat1 ctest/xzcblat1
ctest/xzcblat2 ctest/xzcblat2
ctest/xzcblat3 ctest/xzcblat3
ctest/xzcblat3_3m
exports/linktest.c exports/linktest.c
exports/linux.def exports/linux.def
kernel/setparam_*.c kernel/setparam_*.c
kernel/kernel_*.h kernel/kernel_*.h
test/CBLAT2.SUMM test/CBLAT2.SUMM
test/CBLAT3.SUMM test/CBLAT3.SUMM
test/CBLAT3_3M.SUMM
test/DBLAT2.SUMM test/DBLAT2.SUMM
test/DBLAT3.SUMM test/DBLAT3.SUMM
test/DBLAT3_3M.SUMM
test/SBLAT2.SUMM test/SBLAT2.SUMM
test/SBLAT3.SUMM test/SBLAT3.SUMM
test/SBLAT3_3M.SUMM
test/ZBLAT2.SUMM test/ZBLAT2.SUMM
test/ZBLAT3.SUMM test/ZBLAT3.SUMM
test/ZBLAT3_3M.SUMM
test/SHBLAT3.SUMM test/SHBLAT3.SUMM
test/SBBLAT3.SUMM test/SBBLAT3.SUMM
test/cblat1 test/cblat1
test/cblat2 test/cblat2
test/cblat3 test/cblat3
test/cblat3_3m
test/dblat1 test/dblat1
test/dblat2 test/dblat2
test/dblat3 test/dblat3
test/dblat3_3m
test/sblat1 test/sblat1
test/sblat2 test/sblat2
test/sblat3 test/sblat3
test/sblat3_3m
test/test_shgemm test/test_shgemm
test/test_sbgemm test/test_sbgemm
test/zblat1 test/zblat1
test/zblat2 test/zblat2
test/zblat3 test/zblat3
test/zblat3_3m
build build
build.* build.*
*.swp *.swp

View File

@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF) option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@ -40,6 +42,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni
option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON) option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF)
set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" )
set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" )
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else() else()
@ -96,7 +103,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE}) set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
set(BLASDIRS interface driver/level2 driver/level3 driver/others) set(BLASDIRS interface driver/level2 driver/level3 driver/others)
@ -323,7 +330,7 @@ if (NOT NOFORTRAN)
# Build test and ctest # Build test and ctest
add_subdirectory(test) add_subdirectory(test)
endif() endif()
if (BUILD_TESTING) if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
add_subdirectory(lapack-netlib/TESTING) add_subdirectory(lapack-netlib/TESTING)
endif() endif()
endif() endif()
@ -336,11 +343,12 @@ endif()
add_subdirectory(cpp_thread_test) add_subdirectory(cpp_thread_test)
endif() endif()
if (NOT FIXED_LIBNAME)
set_target_properties(${OpenBLAS_LIBS} PROPERTIES set_target_properties(${OpenBLAS_LIBS} PROPERTIES
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
SOVERSION ${OpenBLAS_MAJOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION}
) )
endif()
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC) if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
@ -452,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
endif() endif()
endif() endif()
# Optional benchmark collection: when BUILD_BENCHMARKS is enabled, every C file
# found in benchmark/ is turned into one executable per precision variant and
# linked against the OpenBLAS library target.
if (BUILD_BENCHMARKS)
#find_package(OpenMP REQUIRED)
file(GLOB SOURCES "benchmark/*.c")
# smallscaling.c is only built when USE_OPENMP is set
# (presumably because it uses OpenMP directly - TODO confirm).
if (NOT USE_OPENMP)
file(GLOB REMFILE "benchmark/smallscaling.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
# Drop the benchmarks listed below when the library is built without LAPACK.
if (BUILD_WITHOUT_LAPACK)
file(GLOB REMFILE "benchmark/cholesky.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/geev.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/gesv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/getri.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/potrf.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/spmv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/symv.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
file(GLOB REMFILE "benchmark/linpack.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
# The GEMM3M benchmark is only built when USE_GEMM3M is set.
if (NOT USE_GEMM3M)
file(GLOB REMFILE "benchmark/gemm3m.c")
list(REMOVE_ITEM SOURCES ${REMFILE})
endif()
foreach(source ${SOURCES})
# NAME_WE yields the file name without directory and extension; it is used
# as the stem of the target name.
get_filename_component(name ${source} NAME_WE)
# zdot-intel.c and cula_wrapper.c are never built as benchmark targets.
if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
# Variants to build per source: no extra definition (DEFAULT), COMPLEX,
# DOUBLE, and both together. The escaped semicolon is intended to keep
# COMPLEX and DOUBLE together as a single element of the outer list.
set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
foreach(define ${defines})
set(target_name "benchmark_${name}")
if (NOT "${define}" STREQUAL "DEFAULT")
# Non-default variants get the definition names appended, joined by "_"
# (e.g. the suffix _COMPLEX_DOUBLE).
string(JOIN "_" define_str ${define})
set(target_name "${target_name}_${define_str}")
endif()
# No COMPLEX or COMPLEX_DOUBLE targets are created for the imax, imin, max
# and min benchmarks.
if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
(NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
add_executable(${target_name} ${source})
# Both the source and the binary directory are on the include path
# (NOTE(review): the binary directory presumably holds generated headers - confirm).
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
# target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
if (NOT "${define}" STREQUAL "DEFAULT")
# Pass the variant's names (COMPLEX and/or DOUBLE) as preprocessor definitions.
target_compile_definitions(${target_name} PRIVATE ${define})
endif()
endif()
endforeach()
endif()
endforeach()
endif()
# Install project # Install project

View File

@ -219,3 +219,7 @@ In chronological order:
* Mark Seminatore <https://github.com/mseminatore> * Mark Seminatore <https://github.com/mseminatore>
* [2023-11-09] Improve Windows threading performance scaling * [2023-11-09] Improve Windows threading performance scaling
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
* Dirreke <https://github.com/dirreke>
* [2024-01-16] Add basic support for the CSKY architecture

View File

@ -1,4 +1,104 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.3.27
4-Apr-2024
general:
- added initial (generic) support for the CSKY architecture
- capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating
underutilized or idle threads
- sped up multithreaded POTRF on all platforms
- added extension openblas_set_num_threads_local() that returns the previous thread count
- re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading
for too small workloads
- improved the fallback code used when the precompiled number of threads is exceeded,
and made it callable multiple times during the lifetime of an instance
- added CBLAS interfaces for the BLAS extensions ?AMIN,?AMAX, CAXPYC and ZAXPYC
- fixed a potential buffer overflow in the interface to the GEMMT kernels
- fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14
- fixed unwanted case sensitivity of the character parameters in ?TRTRS
- sped up the OpenMP thread management code
- fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK
- fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library
- added a testsuite for the BLAS extensions
- modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress
spurious errors
- added support for building the benchmark collection with CMAKE
- added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds
with OpenMP enabled that use clang with gfortran
- fixed building on systems with uClibc
- added support for calling ?NRM2 with a negative increment value on all architectures
- added support for the LLVM18 version of the flang-new compiler
- fixed handling of the OPENBLAS_LOOPS variable in several benchmarks
- Integrated fixes from the Reference-LAPACK project:
- Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981)
x86:
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed GEMM3M functions failing in CMAKE builds
x86-64:
- removed all instances of sched_yield() on Linux and BSD
- fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26)
- fixed GEMM3M functions failing in CMAKE builds
- fixed handling of NaN and Inf arguments in ZSCAL
- added compiler checks for AVX512BF16 compatibility
- fixed LLVM compiler options for Sapphire Rapids
- fixed cpu handling fallbacks for Sapphire Rapids with
disabled AVX2 in DYNAMIC_ARCH mode
- fixed extensions SCSUM and DZSUM
- improved GEMM performance for ZEN targets
arm:
- fixed handling of NaN and Inf arguments in ZSCAL
arm64:
- added initial support for the Cortex-A76 cpu
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed default compiler options for gcc (-march and -mtune)
- added support for ArmCompilerForLinux
- added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds
- fixed mishandling of the INTERFACE64 option in CMAKE builds
- corrected SCSUM kernels (erroneously duplicating SCASUM behaviour)
- added SVE-enabled kernels for CSUM/ZSUM
- worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M
power:
- improved performance of SGEMM on POWER8/9/10
- improved performance of DGEMM on POWER10
- added support for OpenMP builds with xlc/xlf on AIX
- improved cpu autodetection for DYNAMIC_ARCH builds on older AIX
- fixed cpu core counting on AIX
- added support for building a shared library on AIX
riscv64:
- added support for the X280 cpu
- added support for semi-generic RISCV models with vector length 128 or 256
- added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers
- fixed handling of NaN and Inf arguments in ZSCAL
- improved cpu model autodetection
- fixed corner cases in ?AXPBY for C910V
- fixed handling of zero increments in ?AXPY kernels for C910V
loongarch64:
- added optimized kernels for ?AMIN and ?AMAX
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed handling of corner cases in ?AXPBY
- fixed computation of SAMIN and DAMIN in LSX mode
- fixed computation of ?ROT
- added optimized SSYMV and DSYMV kernels for LSX and LASX mode
- added optimized CGEMM and ZGEMM kernels for LSX and LASX mode
- added optimized CGEMV and ZGEMV kernels
mips:
- fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22)
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed mishandling of the INTERFACE64 option in CMAKE builds
zarch:
- fixed handling of NaN and Inf arguments in ZSCAL
- fixed calculation of ?SUM on Z13
==================================================================== ====================================================================
Version 0.3.26 Version 0.3.26
2-Jan-2024 2-Jan-2024

View File

@ -1,5 +1,9 @@
TOPDIR = . TOPDIR = .
include ./Makefile.system include ./Makefile.system
LNCMD = ln -fs
ifeq ($(FIXED_LIBNAME), 1)
LNCMD = true
endif
BLASDIRS = interface driver/level2 driver/level3 driver/others BLASDIRS = interface driver/level2 driver/level3 driver/others
@ -134,17 +138,17 @@ shared : libs netlib $(RELA)
ifneq ($(NO_SHARED), 1) ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@$(MAKE) -C exports so @$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
@$(MAKE) -C exports so @$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn @$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib @$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll @$(MAKE) -C exports dll
@ -152,6 +156,9 @@ endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
@$(MAKE) -C exports dll @$(MAKE) -C exports dll
endif endif
ifeq ($(OSNAME), AIX)
@$(MAKE) -C exports so
endif
endif endif
tests : shared tests : shared
@ -229,13 +236,13 @@ ifeq ($(INTERFACE64),1)
endif endif
@echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last @echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
@echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last @echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@touch lib.grd @touch lib.grd
prof : prof_blas prof_lapack prof : prof_blas prof_lapack
prof_blas : prof_blas :
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
for d in $(SUBDIRS) ; \ for d in $(SUBDIRS) ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d prof || exit 1 ; \ $(MAKE) -C $$d prof || exit 1 ; \
@ -246,7 +253,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
endif endif
blas : blas :
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ; \ for d in $(BLASDIRS) ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d libs || exit 1 ; \ $(MAKE) -C $$d libs || exit 1 ; \
@ -254,7 +261,7 @@ blas :
done done
hpl : hpl :
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(BLASDIRS) ../laswp exports ; \ for d in $(BLASDIRS) ../laswp exports ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
@ -268,7 +275,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
endif endif
hpl_p : hpl_p :
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
for d in $(SUBDIRS) ../laswp exports ; \ for d in $(SUBDIRS) ../laswp exports ; \
do if test -d $$d; then \ do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \ $(MAKE) -C $$d $(@F) || exit 1 ; \
@ -309,8 +316,12 @@ endif
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
else
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGIBM1)
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
else else
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
endif endif
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -401,6 +412,7 @@ lapack-runtest: lapack-test
blas-test: blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)

View File

@ -58,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif endif
endif endif
ifeq ($(CORE), CORTEXA76)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
endif
endif
ifeq ($(CORE), FT2000) ifeq ($(CORE), FT2000)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
@ -104,19 +111,25 @@ ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif endif
else else
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(CROSS), 1)
CCOMMON_OPT += -mtune=native
endif
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=native FCOMMON_OPT += -march=armv8.4-a
ifneq ($(CROSS), 1)
FCOMMON_OPT += -mtune=native
endif
endif endif
endif endif
else else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif endif
endif endif
else else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif endif
@ -132,25 +145,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
ifneq ($(OSNAME), Darwin) ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
else else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
endif endif
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
endif endif
else else
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native CCOMMON_OPT += -march=armv8.5-a+sve
ifneq ($(CROSS), 1)
CCOMMON_OPT += -mtune=native
endif
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a -mtune=native FCOMMON_OPT += -march=armv8.5-a
ifneq ($(CROSS), 1)
FCOMMON_OPT += -mtune=native
endif
endif endif
endif endif
else else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif endif
endif endif
else else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif endif
@ -258,9 +277,17 @@ endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXX1) ifeq ($(CORE), CORTEXX1)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 CCOMMON_OPT += -march=armv8.2-a
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ12) $(ISCLANG)))
CCOMMON_OPT += -mtune=cortex-x1
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-x1
endif
else
CCOMMON_OPT += -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
endif endif
endif endif
endif endif
@ -271,6 +298,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve FCOMMON_OPT += -march=armv8.4-a+sve
endif endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
CCOMMON_OPT += -mtune=cortex-x2
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -mtune=cortex-x2
endif
endif
endif endif
endif endif
@ -290,6 +323,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve FCOMMON_OPT += -march=armv8.4-a+sve
endif endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
CCOMMON_OPT += -mtune=cortex-a710
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -mtune=cortex-a710
endif
endif
endif endif
endif endif

4
Makefile.csky Normal file
View File

@ -0,0 +1,4 @@
ifeq ($(CORE), CK860FV)
CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
endif

View File

@ -2,11 +2,15 @@ TOPDIR = .
export GOTOBLAS_MAKEFILE = 1 export GOTOBLAS_MAKEFILE = 1
-include $(TOPDIR)/Makefile.conf_last -include $(TOPDIR)/Makefile.conf_last
include ./Makefile.system include ./Makefile.system
LNCMD = ln -fs
ifdef THELIBNAME ifdef THELIBNAME
LIBNAME=$(THELIBNAME) LIBNAME=$(THELIBNAME)
LIBSONAME=$(THELIBSONAME) LIBSONAME=$(THELIBSONAME)
endif endif
ifeq ($(FIXED_LIBNAME), 1)
LNCMD = true
endif
ifeq ($(INTERFACE64),1) ifeq ($(INTERFACE64),1)
USE_64BITINT=1 USE_64BITINT=1
endif endif
@ -99,7 +103,7 @@ ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif endif
#for install shared library #for install shared library
ifneq ($(NO_SHARED),1) ifneq ($(NO_SHARED),1)
@ -107,21 +111,21 @@ ifneq ($(NO_SHARED),1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD)) ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \ $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib $(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@ -149,15 +153,15 @@ ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) $(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif endif
#for install shared library #for install shared library
ifneq ($(NO_SHARED),1) ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) $(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
endif endif
@ -170,6 +174,8 @@ endif
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
@echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)"
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@ -186,7 +192,7 @@ endif
ifneq ($(NO_SHARED),1) ifneq ($(NO_SHARED),1)
#ifeq logical or #ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V)
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
endif endif
ifeq ($(TARGET), CK860FV)
TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
endif
ifeq ($(TARGET), x280)
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
endif
ifeq ($(TARGET), RISCV64_ZVL256B)
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
endif
ifeq ($(TARGET), RISCV64_ZVL128B)
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
endif
ifeq ($(TARGET), RISCV64_GENERIC)
TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
endif
all: getarch_2nd all: getarch_2nd
./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF) ./getarch_2nd 1 >> $(TARGET_CONF)

View File

@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
endif endif
ifeq ($(CORE), x280)
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
endif
ifeq ($(CORE), RISCV64_ZVL256B)
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
endif
ifeq ($(CORE), RISCV64_ZVL128B)
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
endif
ifeq ($(CORE), RISCV64_GENERIC)
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
endif

View File

@ -3,7 +3,12 @@
# #
# This library's version # This library's version
VERSION = 0.3.26 VERSION = 0.3.26.dev
# If you set this prefix, the library name will be lib$(LIBNAMEPREFIX)openblas.a
# and lib$(LIBNAMEPREFIX)openblas.so, with a matching soname in the shared library
#
# LIBNAMEPREFIX = scipy
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -365,8 +365,9 @@ GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
# Note that the behavior of -dumpversion is compile-time-configurable for # Note that the behavior of -dumpversion is compile-time-configurable for
# gcc-7.x and newer. Use -dumpfullversion there # gcc-7.x and newer. Use -dumpfullversion there
ifeq ($(GCCVERSIONGTEQ7),1) ifeq ($(GCCVERSIONGTEQ7),1)
@ -873,6 +874,11 @@ endif
endif endif
endif endif
ifeq ($(ARCH), csky)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
# #
# C Compiler dependent settings # C Compiler dependent settings
# #
@ -1176,7 +1182,7 @@ ifeq ($(F_COMPILER), IBM)
CCOMMON_OPT += -DF_INTERFACE_IBM CCOMMON_OPT += -DF_INTERFACE_IBM
FEXTRALIB += -lxlf90 FEXTRALIB += -lxlf90
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG)) ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
FCOMMON_OPT += -qextname FCOMMON_OPT += -qextname -qzerosize
endif endif
# FCOMMON_OPT += -qarch=440 # FCOMMON_OPT += -qarch=440
ifdef BINARY64 ifdef BINARY64
@ -1511,16 +1517,28 @@ ifndef LIBSONAMEBASE
LIBSONAMEBASE = openblas LIBSONAMEBASE = openblas
endif endif
ifndef LIBNAMEPREFIX
LIBNAMEPREFIX =
endif
SYMPREFIX=$(SYMBOLPREFIX)
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
SYMPREFIX=
endif
SYMSUFFIX=$(SYMBOLSUFFIX)
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
SYMSUFFIX=
endif
ifndef LIBNAMESUFFIX ifndef LIBNAMESUFFIX
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
else else
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
LIBPREFIX = cyg$(LIBNAMEBASE) LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE)
else else
LIBPREFIX = lib$(LIBNAMEBASE) LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE)
endif endif
KERNELDIR = $(TOPDIR)/kernel/$(ARCH) KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
@ -1652,6 +1670,10 @@ ifeq ($(F_COMPILER),CRAY)
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif endif
ifeq ($(F_COMPILER),FLANGNEW)
LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif
LAPACK_CFLAGS = $(CFLAGS) LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
@ -1699,14 +1721,14 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
endif endif
endif endif
# With FIXED_LIBNAME=1 the library gets a non-versioned file name built only
# from the optional name prefix, the base name and the optional name suffix.
ifeq ($(FIXED_LIBNAME),1)
LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
# The profiling library uses the same base name plus "_p". The variable was
# previously misspelled as LISOBNAMEBASE, which make expands to an empty
# string, so the base name was silently dropped from LIBNAME_P.
LIBNAME_P = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
endif
LIBDLLNAME = $(LIBPREFIX).dll LIBDLLNAME = $(LIBPREFIX).dll
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
ifneq ($(OSNAME), AIX)
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
else
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
endif
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)

View File

@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC)
endif endif
endif endif
else ifeq ($(C_COMPILER), CLANG) else ifeq ($(C_COMPILER), CLANG)
# cooperlake support was added in clang 12 # sapphire rapids support was added in clang 12
ifeq ($(CLANGVERSIONGTEQ12), 1) ifeq ($(CLANGVERSIONGTEQ12), 1)
CCOMMON_OPT += -march=cooperlake CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=sapphirerapids
endif endif
else # not supported in clang, fallback to avx512 else # not supported in clang, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512 CCOMMON_OPT += -march=skylake-avx512

View File

@ -167,6 +167,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **Cortex A57**: Optimized Level-3 and Level-2 functions - **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 (different cpu specifications) - **Cortex A72**: same as A57 (different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications) - **Cortex A73**: same as A57 (different cpu specifications)
- **Cortex A76**: same as A57 (different cpu specifications)
- **Falkor**: same as A57 (different cpu specifications) - **Falkor**: same as A57 (different cpu specifications)
- **ThunderX**: Optimized some Level-1 functions - **ThunderX**: Optimized some Level-1 functions
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
@ -185,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2. - **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
- **AIX**: Dynamic architecture with OpenXL and OpenMP.
```sh
make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
```
#### IBM zEnterprise System #### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 - **Z13**: Optimized Level-3 BLAS and Level-1,2
@ -198,6 +204,21 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
``` ```
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision) (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
```sh
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
```
- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
e.g.:
```sh
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
HOSTCC=gcc HOSTFC=gfortran -j
```
### Support for multiple targets in a single library ### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
@ -227,7 +248,7 @@ Please note that it is not possible to combine support for different architectur
- **NetBSD**: Supported by the community. We don't actively test the library on this OS. - **NetBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. - **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **AIX**: Supported on PPC up to POWER8 - **AIX**: Supported on PPC up to POWER10
- **Haiku**: Supported by the community. We don't actively test the library on this OS. - **Haiku**: Supported by the community. We don't actively test the library on this OS.
- **SunOS**: Supported by the community. We don't actively test the library on this OS. - **SunOS**: Supported by the community. We don't actively test the library on this OS.
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>. - **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.

View File

@ -93,6 +93,7 @@ CORTEXA53
CORTEXA57 CORTEXA57
CORTEXA72 CORTEXA72
CORTEXA73 CORTEXA73
CORTEXA76
CORTEXA510 CORTEXA510
CORTEXA710 CORTEXA710
CORTEXX1 CORTEXX1
@ -118,8 +119,11 @@ Z13
Z14 Z14
10.RISC-V 64: 10.RISC-V 64:
RISCV64_GENERIC RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
RISCV64_ZVL128B
C910V C910V
x280
RISCV64_ZVL256B
11.LOONGARCH64: 11.LOONGARCH64:
LOONGSONGENERIC LOONGSONGENERIC
@ -133,3 +137,7 @@ E2K
EV4 EV4
EV5 EV5
EV6 EV6
14.CSKY
CSKY
CK860FV

View File

@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
# x280 temporary workaround for gfortran: for TARGET=x280, strip the words
# "-mllvm" and "--riscv-v-vector-bits-min=512" from CCOMMON_OPT.
# filter-out matches each word independently, so every "-mllvm" found in
# CCOMMON_OPT is removed, not only the one paired with the vector-bits option.
ifeq ($(TARGET), x280)
CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
endif
ifneq ($(NO_LAPACK), 1) ifneq ($(NO_LAPACK), 1)
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \ scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \

View File

@ -92,7 +92,7 @@ int main(int argc, char *argv[]){
if ((p = getenv("OPENBLAS_TEST"))) btest=*p; if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);

View File

@ -85,7 +85,7 @@ int main(int argc, char *argv[]){
double time1, time2, timeg1,timeg2; double time1, time2, timeg1,timeg2;
char *p; char *p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
argc--;argv++; argc--;argv++;

View File

@ -120,7 +120,7 @@ int main(int argc, char *argv[]){
if ((p = getenv("OPENBLAS_TEST"))) btest=*p; if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);

View File

@ -54,7 +54,7 @@ int main(int argc, char *argv[]){
int step = 1; int step = 1;
int loops = 1; int loops = 1;
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p; if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
double time1,timeg; double time1,timeg;

24
c_check
View File

@ -91,6 +91,7 @@ case "$data" in
*ARCH_ZARCH*) architecture=zarch ;; *ARCH_ZARCH*) architecture=zarch ;;
*ARCH_RISCV64*) architecture=riscv64 ;; *ARCH_RISCV64*) architecture=riscv64 ;;
*ARCH_LOONGARCH64*) architecture=loongarch64 ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
*ARCH_CSKY*) architecture=csky ;;
esac esac
defined=0 defined=0
@ -236,6 +237,7 @@ case "$data" in
*ARCH_ARM*) architecture=arm ;; *ARCH_ARM*) architecture=arm ;;
*ARCH_ZARCH*) architecture=zarch ;; *ARCH_ZARCH*) architecture=zarch ;;
*ARCH_LOONGARCH64*) architecture=loongarch64 ;; *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
*ARCH_CSKY*) architecture=csky ;;
esac esac
binformat='bin32' binformat='bin32'
@ -244,6 +246,7 @@ case "$data" in
esac esac
no_avx512=0 no_avx512=0
no_avx512bf=0
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC') tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c" tmpf="$tmpd/a.c"
@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
} }
rm -rf "$tmpd" rm -rf "$tmpd"
# AVX512 BF16 probe, run only when AVX512 itself was not ruled out
# (no_avx512 is 0). A small C file is compiled with the cooperlake target
# option; if the compiler invocation fails, no_avx512bf is set to 1.
if [ "$no_avx512" -eq 0 ]; then
# Scratch directory holding the probe source and object file (removed below).
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
tmpf="$tmpd/a.c"
# NOTE(review): the value of $code includes the inner double quotes, so the
# generated main() contains it as a C string literal statement rather than as
# an intrinsic call. The probe therefore seems to verify only that
# <immintrin.h> and the cooperlake option are accepted - confirm the intent.
code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
# PGI selects the target with -tp instead of -march.
if [ "$compiler" = "PGI" ]; then
args=" -tp cooperlake -c -o $tmpf.o $tmpf"
else
args=" -march=cooperlake -c -o $tmpf.o $tmpf"
fi
no_avx512bf=0
# Compiler output is discarded; only the exit status decides the result.
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
no_avx512bf=1
}
rm -rf "$tmpd"
fi
fi fi
no_rv64gv=0 no_rv64gv=0
@ -409,6 +431,7 @@ done
[ "$makefile" = "-" ] && { [ "$makefile" = "-" ] && {
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
exit 0 exit 0
@ -437,6 +460,7 @@ done
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n" [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n" [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"

View File

@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$architecture = csky if ($data =~ /ARCH_CSKY/);
$defined = 0; $defined = 0;
@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
$binary = 64; $binary = 64;
} }
# C-SKY: mark the architecture as recognised and build it as 32-bit.
if ($architecture eq "csky") {
    $binary  = 32;
    $defined = 1;
}
if ($compiler eq "PGI") { if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64"); $compiler_name .= " -tp p7-64" if ($binary eq "64");
@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/); $architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$architecture = csky if ($data =~ /ARCH_CSKY/);
$binformat = bin32; $binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/); $binformat = bin64 if ($data =~ /BINARY_64/);

22
cblas.h
View File

@ -12,6 +12,7 @@ extern "C" {
/*Set the number of threads on runtime.*/ /*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads); void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads);
int openblas_set_num_threads_local(int num_threads);
/*Get the number of threads on runtime.*/ /*Get the number of threads on runtime.*/
int openblas_get_num_threads(void); int openblas_get_num_threads(void);
@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc); OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);

View File

@ -64,6 +64,7 @@ else ()
"#define NEEDBUNDERSCORE 1\n") "#define NEEDBUNDERSCORE 1\n")
endif() endif()
if (CMAKE_Fortran_COMPILER)
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
string(TOUPPER ${F_COMPILER} F_COMPILER) string(TOUPPER ${F_COMPILER} F_COMPILER)
endif()

View File

@ -6,9 +6,6 @@
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# This is for classic Flang. LLVM Flang is handled with gfortran below. # This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
if (USE_OPENMP) if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
endif () endif ()
@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
if (MIPS64) if (MIPS64)
if (BINARY64) if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else () else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
endif () endif ()
@ -83,9 +83,14 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif () endif ()
endif () endif ()
endif () endif ()
if (ARM64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else () else ()
if (BINARY64) if (BINARY64)
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FCOMMON_OPT "${FCOMMON_OPT} -m64") set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
endif ()
if (INTERFACE64) if (INTERFACE64)
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
if (WIN32) if (WIN32)
@ -98,9 +103,11 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif () endif ()
endif () endif ()
else () else ()
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FCOMMON_OPT "${FCOMMON_OPT} -m32") set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
endif () endif ()
endif () endif ()
endif ()
if (USE_OPENMP) if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")

View File

@ -1,4 +1,6 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libnameprefix=@LIBNAMEPREFIX@
libnamesuffix=@LIBNAMESUFFIX@
libsuffix=@SUFFIX64_UNDERSCORE@ libsuffix=@SUFFIX64_UNDERSCORE@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
@ -7,5 +9,5 @@ Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OpenBLAS_VERSION@ Version: @OpenBLAS_VERSION@
URL: https://github.com/OpenMathLib/OpenBLAS URL: https://github.com/OpenMathLib/OpenBLAS
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix}
Cflags: -I${includedir} Cflags: -I${includedir}

View File

@ -932,7 +932,7 @@ endif ()
set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16) set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73" OR "${TCORE}" STREQUAL "CORTEXA76")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n" "#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_LINESIZE\t64\n" "#define L1_CODE_LINESIZE\t64\n"

View File

@ -501,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH
endif() endif()
endif() endif()
endif() endif()
set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas")
if (DEFINED LIBNAMESUFFIX) if (DEFINED LIBNAMESUFFIX)
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}")
else ()
set(LIBPREFIX "libopenblas")
endif () endif ()
if (NOT DEFINED SYMBOLPREFIX) if (NOT DEFINED SYMBOLPREFIX)
@ -615,13 +616,19 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
endforeach () endforeach ()
endif () endif ()
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY") if (CMAKE_Fortran_COMPILER)
if (${F_COMPILER} STREQUAL "NAG" OR ${F_COMPILER} STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
message(STATUS "removing fortran flags")
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
endif ()
foreach (FILTER_FLAG ${FILTER_FLAGS}) foreach (FILTER_FLAG ${FILTER_FLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
endforeach () endforeach ()
endif () endif ()
endif ()
if ("${F_COMPILER}" STREQUAL "GFORTRAN") if ("${F_COMPILER}" STREQUAL "GFORTRAN")
# lapack-netlib is rife with uninitialized warnings -hpa # lapack-netlib is rife with uninitialized warnings -hpa
@ -679,6 +686,10 @@ else ()
endif () endif ()
endif () endif ()
# FIXED_LIBNAME: derive the library file names from LIBPREFIX and LIBSUFFIX only.
if (DEFINED FIXED_LIBNAME)
  set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}")
  # The "_p" name goes into its own variable; assigning LIBNAME a second time
  # would discard the regular name set on the previous line.
  set (LIBNAME_P "${LIBPREFIX}_p.${LIBSUFFIX}")
endif()
set(LIBDLLNAME "${LIBPREFIX}.dll") set(LIBDLLNAME "${LIBPREFIX}.dll")
set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so") set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")

View File

@ -358,12 +358,6 @@ typedef int blasint;
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
#endif #endif
#ifdef BULLDOZER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(POWER8) || defined(POWER9) || defined(POWER10)
#ifndef YIELDING #ifndef YIELDING
@ -371,21 +365,13 @@ typedef int blasint;
#endif #endif
#endif #endif
/*
#ifdef PILEDRIVER
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
*/
/* #if defined(ARCH_X86_64)
#ifdef STEAMROLLER
#ifndef YIELDING #ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif #endif
#endif #endif
*/
#ifdef __EMSCRIPTEN__ #ifdef __EMSCRIPTEN__
#define YIELDING #define YIELDING
@ -396,7 +382,7 @@ typedef int blasint;
#endif #endif
/*** /***
To alloc job_t on heap or statck. To alloc job_t on heap or stack.
please see https://github.com/xianyi/OpenBLAS/issues/246 please see https://github.com/xianyi/OpenBLAS/issues/246
***/ ***/
#if defined(OS_WINDOWS) #if defined(OS_WINDOWS)
@ -482,6 +468,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_e2k.h" #include "common_e2k.h"
#endif #endif
#ifdef ARCH_CSKY
#include "common_csky.h"
#endif
#ifndef ASSEMBLER #ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE #ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH]; typedef char env_var_t[MAX_PATH];

56
common_csky.h Normal file
View File

@ -0,0 +1,56 @@
/*****************************************************************************
Copyright (c) 2011-2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#ifndef COMMON_CSKY
#define COMMON_CSKY
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define RMB __sync_synchronize()
#define INLINE inline
#ifndef ASSEMBLER
/* Return the quotient of x divided by y using ordinary C integer division.
   No check is made on y, so the caller must pass a non-zero divisor. */
static inline int blas_quickdivide(blasint x, blasint y)
{
  blasint quotient = x / y;
  return quotient;
}
#endif
#define BUFFER_SIZE ( 32 << 20)
#define SEEK_ADDRESS
#endif

View File

@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *, void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *); xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
double *, blasint *, double *, blasint *, double *, double *, blasint *);
int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *, int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
float *, float *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *,
float *, float *, blasint *); float *, float *, blasint *);
@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *); void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *); void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *); void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);

View File

@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define BUFFER_SIZE ( 32 << 20) #define BUFFER_SIZE ( 32 << 20)
#define SEEK_ADDRESS #define SEEK_ADDRESS
#if defined(C910V) #if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
# include <riscv_vector.h> # include <riscv_vector.h>
#endif #endif
/* RISCV_RVV(name) spells an RVV intrinsic name for the toolchain in use:
 * normally the name gets the __riscv_ prefix; the T-Head toolchain with
 * __riscv_v <= 7000 uses the bare name and also gets RISCV_0p10_INTRINSICS. */
#if !defined(__riscv_xtheadc) || !defined(__riscv_v) || (__riscv_v > 7000)
#  define RISCV_RVV(name) __riscv_ ## name
#else
/* t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this */
#  define RISCV_0p10_INTRINSICS
#  define RISCV_RVV(name) name
#endif
/* EXTRACT_FLOAT(v): obtain a scalar from v.
 * C910V and RISCV64_ZVL256B builds call the vfmv_f_s intrinsic (the f32
 * flavour unless DOUBLE is defined, then the f64 flavour), spelled through
 * RISCV_RVV. Every other build indexes element 0 of v directly. */
#if defined(C910V) || defined(RISCV64_ZVL256B)
# if !defined(DOUBLE)
#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
# else
#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
# endif
#else
/* The argument is parenthesized so that expression arguments (e.g. p + 1)
 * are indexed as a whole instead of mis-binding with the subscript. */
# define EXTRACT_FLOAT(v) ((v)[0])
#endif
#endif #endif

View File

@ -137,19 +137,20 @@ typedef struct blas_queue {
extern int blas_server_avail; extern int blas_server_avail;
extern int blas_omp_number_max; extern int blas_omp_number_max;
extern int blas_omp_threads_local;
static __inline int num_cpu_avail(int level) { static __inline int num_cpu_avail(int level) {
#ifdef USE_OPENMP #ifdef USE_OPENMP
int openmp_nthreads; int openmp_nthreads;
openmp_nthreads=omp_get_max_threads(); openmp_nthreads=omp_get_max_threads();
if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
#endif #endif
#ifndef USE_OPENMP #ifndef USE_OPENMP
if (blas_cpu_number == 1 if (blas_cpu_number == 1
#endif #else
#ifdef USE_OPENMP if (openmp_nthreads == 1
if (openmp_nthreads == 1 || omp_in_parallel()
#endif #endif
) return 1; ) return 1;

View File

@ -42,6 +42,7 @@ size_t length64=sizeof(value64);
#define CPU_CORTEXA57 3 #define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4 #define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5 #define CPU_CORTEXA73 5
#define CPU_CORTEXA76 23
#define CPU_NEOVERSEN1 11 #define CPU_NEOVERSEN1 11
#define CPU_NEOVERSEV1 16 #define CPU_NEOVERSEV1 16
#define CPU_NEOVERSEN2 17 #define CPU_NEOVERSEN2 17
@ -89,7 +90,8 @@ static char *cpuname[] = {
"CORTEXX2", "CORTEXX2",
"CORTEXA510", "CORTEXA510",
"CORTEXA710", "CORTEXA710",
"FT2000" "FT2000",
"CORTEXA76"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
@ -115,7 +117,8 @@ static char *cpuname_lower[] = {
"cortexx2", "cortexx2",
"cortexa510", "cortexa510",
"cortexa710", "cortexa710",
"ft2000" "ft2000",
"cortexa76"
}; };
int get_feature(char *search) int get_feature(char *search)
@ -210,6 +213,8 @@ int detect(void)
return CPU_CORTEXX2; return CPU_CORTEXX2;
else if (strstr(cpu_part, "0xd4e")) //X3 else if (strstr(cpu_part, "0xd4e")) //X3
return CPU_CORTEXX2; return CPU_CORTEXX2;
else if (strstr(cpu_part, "0xd0b"))
return CPU_CORTEXA76;
} }
// Qualcomm // Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@ -391,6 +396,7 @@ void get_cpuconfig(void)
break; break;
case CPU_NEOVERSEV1: case CPU_NEOVERSEV1:
case CPU_CORTEXA76:
printf("#define %s\n", cpuname[d]); printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_LINESIZE 64\n");

View File

@ -72,10 +72,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_GENERIC 0 #define CPU_GENERIC 0
#define CPU_C910V 1 #define CPU_C910V 1
#define CPU_x280 2
#define CPU_RISCV64_ZVL256B 3
#define CPU_RISCV64_ZVL128B 4
static char *cpuname[] = { static char *cpuname[] = {
"RISCV64_GENERIC", "RISCV64_GENERIC",
"C910V" "C910V",
"x280",
"CPU_RISCV64_ZVL256B",
"CPU_RISCV64_ZVL128B"
};
static char *cpuname_lower[] = {
"riscv64_generic",
"c910v",
"x280",
"riscv64_zvl256b",
"riscv64_zvl128b"
}; };
int detect(void){ int detect(void){
@ -86,21 +100,27 @@ int detect(void){
char *pmodel = NULL, *pisa = NULL; char *pmodel = NULL, *pisa = NULL;
infile = fopen("/proc/cpuinfo", "r"); infile = fopen("/proc/cpuinfo", "r");
if (!infile)
return CPU_GENERIC;
while (fgets(buffer, sizeof(buffer), infile)){ while (fgets(buffer, sizeof(buffer), infile)){
if(!strncmp(buffer, "model name", 10)){ if(!strncmp(buffer, "model name", 10)){
strcpy(model_buffer, buffer); strcpy(model_buffer, buffer);
pmodel = strchr(isa_buffer, ':') + 1; pmodel = strchr(model_buffer, ':');
if (pmodel)
pmodel++;
} }
if(!strncmp(buffer, "isa", 3)){ if(!strncmp(buffer, "isa", 3)){
strcpy(isa_buffer, buffer); strcpy(isa_buffer, buffer);
pisa = strchr(isa_buffer, '4') + 1; pisa = strchr(isa_buffer, '4');
if (pisa)
pisa++;
} }
} }
fclose(infile); fclose(infile);
if (!pmodel) if (!pmodel || !pisa)
return(CPU_GENERIC); return(CPU_GENERIC);
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v')) if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
@ -140,5 +160,5 @@ void get_cpuconfig(void){
} }
void get_libname(void){ void get_libname(void){
printf("riscv64\n"); printf("%s", cpuname_lower[detect()]);
} }

View File

@ -173,6 +173,10 @@ HAVE_C11
ARCH_E2K ARCH_E2K
#endif #endif
#if defined(__csky__)
ARCH_CSKY
#endif
#if defined(__EMSCRIPTEN__) #if defined(__EMSCRIPTEN__)
ARCH_RISCV64 ARCH_RISCV64
OS_WINDOWS OS_WINDOWS

View File

@ -40,6 +40,10 @@ else()
c_${float_char}blas1.c) c_${float_char}blas1.c)
endif() endif()
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
# gfortran + clang with OpenMP: strip -fopenmp from the Fortran flags and link
# the level-1 test driver against omp and pthread explicitly.
# The compiler ids are quoted so the condition stays well-formed even when one
# of them expands to nothing (e.g. no Fortran compiler is enabled).
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
  string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
  target_link_libraries(x${float_char}cblat1 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat1 m) target_link_libraries(x${float_char}cblat1 m)
endif() endif()
@ -65,6 +69,10 @@ else()
constant.c) constant.c)
endif() endif()
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
# gfortran + clang with OpenMP: strip -fopenmp from the Fortran flags and link
# the level-2 test driver against omp and pthread explicitly.
# The compiler ids are quoted so the condition stays well-formed even when one
# of them expands to nothing (e.g. no Fortran compiler is enabled).
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
  string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
  target_link_libraries(x${float_char}cblat2 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat2 m) target_link_libraries(x${float_char}cblat2 m)
endif() endif()
@ -80,6 +88,17 @@ if (NOT NOFORTRAN)
auxiliary.c auxiliary.c
c_xerbla.c c_xerbla.c
constant.c) constant.c)
# GEMM3M test driver (Fortran main program): built only when USE_GEMM3M is
# set, and only for the complex precisions c and z.
if (USE_GEMM3M AND ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")))
  add_executable(x${float_char}cblat3_3m
    c_${float_char}blat3_3m.f
    c_${float_char}blas3_3m.c
    c_${float_char}3chke_3m.c
    auxiliary.c
    c_xerbla.c
    constant.c)
endif()
else() else()
add_executable(x${float_char}cblat3 add_executable(x${float_char}cblat3
c_${float_char}blat3c.c c_${float_char}blat3c.c
@ -88,12 +107,44 @@ else()
auxiliary.c auxiliary.c
c_xerbla.c c_xerbla.c
constant.c) constant.c)
# GEMM3M test driver (C main program, c_<p>blat3c_3m.c): built only when
# USE_GEMM3M is set, and only for the complex precisions c and z.
if (USE_GEMM3M AND ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")))
  add_executable(x${float_char}cblat3_3m
    c_${float_char}blat3c_3m.c
    c_${float_char}blas3_3m.c
    c_${float_char}3chke_3m.c
    auxiliary.c
    c_xerbla.c
    constant.c)
endif()
endif() endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
# gfortran + clang with OpenMP: strip -fopenmp from the Fortran flags and link
# the level-3 test driver against omp and pthread explicitly.
# The compiler ids are quoted so the condition stays well-formed even when one
# of them expands to nothing (e.g. no Fortran compiler is enabled).
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
  string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
  target_link_libraries(x${float_char}cblat3 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3 m) target_link_libraries(x${float_char}cblat3 m)
endif() endif()
# Link settings for the GEMM3M test driver (complex precisions c and z only).
if (USE_GEMM3M)
  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
    target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
    # gfortran + clang with OpenMP: strip -fopenmp from the Fortran flags and
    # link omp/pthread explicitly. Compiler ids are quoted so the condition
    # stays well-formed when one of them expands to nothing.
    if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
      string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
      # The OpenMP runtime has to be linked to the _3m executable itself,
      # not to the plain level-3 driver.
      target_link_libraries(x${float_char}cblat3_3m omp pthread)
    endif()
    if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
      target_link_libraries(x${float_char}cblat3_3m m)
    endif()
  endif()
endif()
add_test(NAME "x${float_char}cblat3" add_test(NAME "x${float_char}cblat3"
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
# Register the GEMM3M driver as a test, fed with ctest/<p>in3_3m; only for
# the complex precisions c and z and only when USE_GEMM3M is set.
if (USE_GEMM3M AND ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z")))
  add_test(
    NAME "x${float_char}cblat3_3m"
    COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
endif()
endforeach() endforeach()

View File

@ -5,6 +5,24 @@
TOPDIR = .. TOPDIR = ..
include $(TOPDIR)/Makefile.system include $(TOPDIR)/Makefile.system
# SUPPORT_GEMM3M is 1 when ARCH is one of x86, x86_64, ia64 or MIPS and 0
# otherwise; it gates building and running the *_3m test drivers below.
ifneq ($(filter x86 x86_64 ia64 MIPS,$(ARCH)),)
SUPPORT_GEMM3M = 1
else
SUPPORT_GEMM3M = 0
endif
override CFLAGS += -DADD$(BU) -DCBLAS override CFLAGS += -DADD$(BU) -DCBLAS
ifeq ($(F_COMPILER),GFORTRAN) ifeq ($(F_COMPILER),GFORTRAN)
override FFLAGS += -fno-tree-vectorize override FFLAGS += -fno-tree-vectorize
@ -144,9 +162,15 @@ all3targets += xdcblat3
endif endif
ifeq ($(BUILD_COMPLEX),1) ifeq ($(BUILD_COMPLEX),1)
all3targets += xccblat3 all3targets += xccblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xccblat3_3m
endif
endif endif
ifeq ($(BUILD_COMPLEX16),1) ifeq ($(BUILD_COMPLEX16),1)
all3targets += xzcblat3 all3targets += xzcblat3
ifeq ($(SUPPORT_GEMM3M),1)
all3targets += xzcblat3_3m
endif
endif endif
all3: $(all3targets) all3: $(all3targets)
@ -181,9 +205,9 @@ endif
endif endif
endif endif
all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(SUPPORT_GEMM3M),1)
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
ifeq ($(BUILD_SINGLE),1) ifeq ($(BUILD_COMPLEX),1)
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
endif endif
ifeq ($(BUILD_COMPLEX16),1) ifeq ($(BUILD_COMPLEX16),1)
@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif endif
endif endif
endif
@ -218,6 +243,9 @@ ifeq ($(F_COMPILER), IBM)
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
CEXTRALIB += -lgomp CEXTRALIB += -lgomp
endif endif
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB += -lomp
endif
endif endif
endif endif
@ -268,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else else
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME) xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) $(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@ -277,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) $(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME) xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) $(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
# GEMM3M single-complex test driver built from the C main program with $(CC).
# The link line matches the sibling $(CC)-linked drivers: -lgfortran is
# filtered out of $(EXTRALIB).
ifeq ($(SUPPORT_GEMM3M),1)
xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif endif
endif endif
@ -290,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
ifeq ($(SUPPORT_GEMM3M),1)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
endif
else else
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME) xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) $(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
@ -299,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) $(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME) xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) $(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
# GEMM3M double-complex test driver built from the C main program with $(CC).
# The link line matches the sibling $(CC)-linked drivers: -lgfortran is
# filtered out of $(EXTRALIB).
ifeq ($(SUPPORT_GEMM3M),1)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif endif
endif endif

View File

@ -96,7 +96,7 @@
INTEGER ICAMAXTEST INTEGER ICAMAXTEST
EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST
* .. External Subroutines .. * .. External Subroutines ..
EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1 EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
* .. Intrinsic Functions .. * .. Intrinsic Functions ..
INTRINSIC MAX INTRINSIC MAX
* .. Common blocks .. * .. Common blocks ..
@ -214,8 +214,8 @@
CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1), CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
+ STRUE4(NP1),SFAC) + STRUE4(NP1),SFAC)
ELSE IF (ICASE.EQ.8) THEN ELSE IF (ICASE.EQ.8) THEN
* .. CSCAL .. * .. CSCALTEST ..
CALL CSCAL(N,CA,CX,INCX) CALL CSCALTEST(N,CA,CX,INCX)
CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX), CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
+ SFAC) + SFAC)
ELSE IF (ICASE.EQ.9) THEN ELSE IF (ICASE.EQ.9) THEN
@ -236,14 +236,14 @@
* *
INCX = 1 INCX = 1
IF (ICASE.EQ.8) THEN IF (ICASE.EQ.8) THEN
* CSCAL * CSCALTEST
* Add a test for alpha equal to zero. * Add a test for alpha equal to zero.
CA = (0.0E0,0.0E0) CA = (0.0E0,0.0E0)
DO 80 I = 1, 5 DO 80 I = 1, 5
MWPCT(I) = (0.0E0,0.0E0) MWPCT(I) = (0.0E0,0.0E0)
MWPCS(I) = (1.0E0,1.0E0) MWPCS(I) = (1.0E0,1.0E0)
80 CONTINUE 80 CONTINUE
CALL CSCAL(5,CA,CX,INCX) CALL CSCALTEST(5,CA,CX,INCX)
CALL CTEST(5,CX,MWPCT,MWPCS,SFAC) CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
ELSE IF (ICASE.EQ.9) THEN ELSE IF (ICASE.EQ.9) THEN
* CSSCALTEST * CSSCALTEST

View File

@ -440,6 +440,7 @@ static real c_b43 = (float)1.;
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*); extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
static complex mwpcs[5], mwpct[5]; static complex mwpcs[5], mwpct[5];
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*); extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
static complex cx[8]; static complex cx[8];
extern real scnrm2test_(integer*, complex*, integer*); extern real scnrm2test_(integer*, complex*, integer*);
static integer np1; static integer np1;
@ -481,7 +482,7 @@ static real c_b43 = (float)1.;
stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac); stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
} else if (combla_1.icase == 8) { } else if (combla_1.icase == 8) {
/* .. CSCAL .. */ /* .. CSCAL .. */
cscal_(&combla_1.n, &ca, cx, &combla_1.incx); cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
&ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac); &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
} else if (combla_1.icase == 9) { } else if (combla_1.icase == 9) {
@ -515,7 +516,7 @@ static real c_b43 = (float)1.;
mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.; mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
/* L80: */ /* L80: */
} }
cscal_(&c__5, &ca, cx, &combla_1.incx); cscaltest_(&c__5, &ca, cx, &combla_1.incx);
ctest_(&c__5, cx, mwpct, mwpcs, sfac); ctest_(&c__5, cx, mwpct, mwpcs, sfac);
} else if (combla_1.icase == 9) { } else if (combla_1.icase == 9) {
/* CSSCALTEST */ /* CSSCALTEST */

3942
ctest/c_cblat3c_3m.c Normal file

File diff suppressed because it is too large Load Diff

3951
ctest/c_zblat3c_3m.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -545,13 +545,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, IFLOAT *sa, IFLOAT *sb, *range_n, IFLOAT *sa, IFLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) { BLASLONG nthreads_m, BLASLONG nthreads_n) {
#ifndef USE_OPENMP #ifdef USE_OPENMP
#ifndef OS_WINDOWS static omp_lock_t level3_lock, critical_section_lock;
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER; static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0,
#else parallel_section_left = MAX_PARALLEL_NUMBER;
// Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c
while(omp_lock_initialized == 0)
{
blas_lock(&init_lock);
{
if(omp_lock_initialized == 0)
{
omp_init_lock(&level3_lock);
omp_init_lock(&critical_section_lock);
omp_lock_initialized = 1;
WMB;
}
blas_unlock(&init_lock);
}
}
#elif defined(OS_WINDOWS)
CRITICAL_SECTION level3_lock; CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock); InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif #else
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#endif #endif
blas_arg_t newarg; blas_arg_t newarg;
@ -599,12 +617,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif #endif
#endif #endif
#ifndef USE_OPENMP #ifdef USE_OPENMP
#ifndef OS_WINDOWS omp_set_lock(&level3_lock);
pthread_mutex_lock(&level3_lock); omp_set_lock(&critical_section_lock);
#else
parallel_section_left--;
/*
How OpenMP locks work with NUM_PARALLEL
1) parallel_section_left = number of concurrent OpenBLAS executions allowed - number of OpenBLAS executions currently running
2) level3_lock acts as a master lock (barrier) that stops further OpenBLAS calls while all the parallel sections are busy executing other OpenBLAS calls
3) critical_section_lock protects the variables shared between threads executing OpenBLAS calls concurrently, and the unlocking of the master lock whenever required
4) The master lock is released only when the parallel sections have not all been exhausted, which allows another thread with an OpenBLAS call to enter
*/
if(parallel_section_left != 0)
omp_unset_lock(&level3_lock);
omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif #else
pthread_mutex_lock(&level3_lock);
#endif #endif
#ifdef USE_ALLOC_HEAP #ifdef USE_ALLOC_HEAP
@ -732,12 +766,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
free(job); free(job);
#endif #endif
#ifndef USE_OPENMP #ifdef USE_OPENMP
#ifndef OS_WINDOWS omp_set_lock(&critical_section_lock);
pthread_mutex_unlock(&level3_lock); parallel_section_left++;
#else
/*
Unlock the master lock only when all the parallel sections were exhausted and one of the threads has now completed its OpenBLAS call;
otherwise just increment parallel_section_left.
The master lock is held only while all the parallel sections are exhausted, so unlock it only in that case and otherwise just increment the count.
*/
if(parallel_section_left == 1)
omp_unset_lock(&level3_lock);
omp_unset_lock(&critical_section_lock);
#elif defined(OS_WINDOWS)
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock); LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif #else
pthread_mutex_unlock(&level3_lock);
#endif #endif
return 0; return 0;

View File

@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void);
/* We need this global for checking if initialization is finished. */ /* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
int blas_omp_threads_local = 1;
/* Local Variables */ /* Local Variables */
#if defined(USE_PTHREAD_LOCK) #if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;

View File

@ -69,6 +69,7 @@
int blas_server_avail = 0; int blas_server_avail = 0;
int blas_omp_number_max = 0; int blas_omp_number_max = 0;
int blas_omp_threads_local = 1;
extern int openblas_omp_adaptive_env(void); extern int openblas_omp_adaptive_env(void);
@ -422,7 +423,6 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
if (i != MAX_PARALLEL_NUMBER) if (i != MAX_PARALLEL_NUMBER)
break; break;
} }
if (openblas_omp_adaptive_env() != 0) { if (openblas_omp_adaptive_env() != 0) {
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED) #pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
for (i = 0; i < num; i ++) { for (i = 0; i < num; i ++) {

View File

@ -48,6 +48,12 @@
#endif #endif
#endif #endif
/* MT_TRACE(...): printf-style trace written to stderr when SMP_DEBUG is
 * defined; expands to nothing otherwise, so its arguments are not evaluated. */
#ifndef SMP_DEBUG
#  define MT_TRACE(...)
#else
#  define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
#endif
/* This is a thread implementation for Win32 lazy implementation */ /* This is a thread implementation for Win32 lazy implementation */
/* Thread server common information */ /* Thread server common information */
@ -59,6 +65,8 @@ static CRITICAL_SECTION queue_lock;
/* We need this global for checking if initialization is finished. */ /* We need this global for checking if initialization is finished. */
int blas_server_avail = 0; int blas_server_avail = 0;
int blas_omp_threads_local = 1;
/* Local Variables */ /* Local Variables */
static BLASULONG server_lock = 0; static BLASULONG server_lock = 0;
@ -66,16 +74,9 @@ static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER];
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
#if defined (__GNUC__) && (__GNUC__ < 6) //
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) // Legacy code path
#else //
#if defined(_WIN64)
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
#else
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
#endif
#endif
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) { static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
if (!(mode & BLAS_COMPLEX)) { if (!(mode & BLAS_COMPLEX)) {
@ -199,9 +200,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} }
} }
/* This is a main routine of threads. Each thread waits until job is */ //
/* queued. */ // This is a main routine of threads. Each thread waits until job is queued.
//
static DWORD WINAPI blas_thread_server(void *arg) { static DWORD WINAPI blas_thread_server(void *arg) {
/* Thread identifier */ /* Thread identifier */
@ -213,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
/* Each server needs each buffer */ /* Each server needs each buffer */
buffer = blas_memory_alloc(2); buffer = blas_memory_alloc(2);
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
#endif
while (1) { while (1) {
/* Waiting for Queue */ /* Waiting for Queue */
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
#endif
// event raised when work is added to the queue // event raised when work is added to the queue
WaitForSingleObject(kickoff_event, INFINITE); WaitForSingleObject(kickoff_event, INFINITE);
if (cpu > thread_target - 2) if (cpu > thread_target - 2) {
{ //MT_TRACE("thread [%d] exiting.\n", cpu);
//printf("thread [%d] exiting.\n", cpu);
break; // excess thread, so worker thread exits break; // excess thread, so worker thread exits
} }
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Got it.\n", cpu);
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
#endif
#if 1
EnterCriticalSection(&queue_lock); EnterCriticalSection(&queue_lock);
queue = work_queue; queue = work_queue;
@ -245,19 +239,6 @@ static DWORD WINAPI blas_thread_server(void *arg){
work_queue = work_queue->next; work_queue = work_queue->next;
LeaveCriticalSection(&queue_lock); LeaveCriticalSection(&queue_lock);
#else
volatile blas_queue_t* queue_next;
INT_PTR prev_value;
do {
queue = (volatile blas_queue_t*)work_queue;
if (!queue)
break;
queue_next = (volatile blas_queue_t*)queue->next;
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
} while (prev_value != queue);
#endif
if (queue) { if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
@ -270,10 +251,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif #endif
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
#endif
// fprintf(stderr, "queue start[%ld]!!!\n", cpu); // fprintf(stderr, "queue start[%ld]!!!\n", cpu);
@ -281,7 +260,8 @@ static DWORD WINAPI blas_thread_server(void *arg){
main_status[cpu] = MAIN_RUNNING1; main_status[cpu] = MAIN_RUNNING1;
#endif #endif
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sa == NULL)
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sb == NULL) { if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)) { if (!(queue -> mode & BLAS_COMPLEX)) {
@ -333,7 +313,6 @@ static DWORD WINAPI blas_thread_server(void *arg){
#endif #endif
if (!(queue -> mode & BLAS_LEGACY)) { if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
} else { } else {
legacy_exec(routine, queue -> mode, queue -> args, sb); legacy_exec(routine, queue -> mode, queue -> args, sb);
@ -342,26 +321,23 @@ static DWORD WINAPI blas_thread_server(void *arg){
continue; //if queue == NULL continue; //if queue == NULL
} }
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Finished!\n", cpu);
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
#endif
queue->finished = 1; queue->finished = 1;
} }
/* Shutdown procedure */ /* Shutdown procedure */
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
#endif
blas_memory_free(buffer); blas_memory_free(buffer);
return 0; return 0;
} }
/* Initializing routine */ //
// Initializing routine
//
int blas_thread_init(void) { int blas_thread_init(void) {
BLASLONG i; BLASLONG i;
@ -369,10 +345,7 @@ int blas_thread_init(void){
LOCK_COMMAND(&server_lock); LOCK_COMMAND(&server_lock);
#ifdef SMP_DEBUG MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
blas_cpu_number);
#endif
if (!blas_server_avail) { if (!blas_server_avail) {
// create the kickoff Event // create the kickoff Event
@ -383,7 +356,7 @@ int blas_thread_init(void){
InitializeCriticalSection(&queue_lock); InitializeCriticalSection(&queue_lock);
for(i = 0; i < blas_cpu_number - 1; i++) { for(i = 0; i < blas_cpu_number - 1; i++) {
//printf("thread_init: creating thread [%d]\n", i); //MT_TRACE("thread_init: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0, blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i, blas_thread_server, (void *)i,
@ -398,14 +371,11 @@ int blas_thread_init(void){
return 0; return 0;
} }
/* //
User can call one of two routines. // User can call one of two routines.
// exec_blas_async ... immediately returns after jobs are queued.
exec_blas_async ... immediately returns after jobs are queued. // exec_blas ... returns after jobs are finished.
//
exec_blas ... returns after jobs are finished.
*/
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) { int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
#if defined(SMP_SERVER) #if defined(SMP_SERVER)
@ -439,14 +409,14 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
} }
else else
{ {
blas_queue_t *next_item = work_queue; blas_queue_t *queue_item = work_queue;
// find the end of the work queue // find the end of the work queue
while (next_item) while (queue_item->next)
next_item = next_item->next; queue_item = queue_item->next;
// add new work to the end // add new work to the end
next_item = queue; queue_item->next = queue;
} }
LeaveCriticalSection(&queue_lock); LeaveCriticalSection(&queue_lock);
@ -456,16 +426,16 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
return 0; return 0;
} }
//
// Join. Wait for all queued tasks to complete
//
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) { int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
#ifdef SMP_DEBUG MT_TRACE("Synchronization Waiting.\n");
fprintf(STDERR, "Synchronization Waiting.\n");
#endif
while (num) { while (num) {
#ifdef SMP_DEBUG MT_TRACE("Waiting Queue ..\n");
fprintf(STDERR, "Waiting Queue ..\n");
#endif
while (!queue->finished) while (!queue->finished)
YIELDING; YIELDING;
@ -473,9 +443,8 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
num--; num--;
} }
#ifdef SMP_DEBUG MT_TRACE("Completely Done.\n\n");
fprintf(STDERR, "Completely Done.\n\n");
#endif
// if work was added to the queue after this batch we can't sleep the worker threads // if work was added to the queue after this batch we can't sleep the worker threads
// by resetting the event // by resetting the event
EnterCriticalSection(&queue_lock); EnterCriticalSection(&queue_lock);
@ -488,7 +457,9 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
return 0; return 0;
} }
/* Execute Threads */ //
// Execute Threads
//
int exec_blas(BLASLONG num, blas_queue_t *queue) { int exec_blas(BLASLONG num, blas_queue_t *queue) {
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
@ -502,28 +473,32 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
if ((num <= 0) || (queue == NULL)) return 0; if ((num <= 0) || (queue == NULL)) return 0;
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); if ((num > 1) && queue -> next)
exec_blas_async(1, queue -> next);
routine = queue -> routine; routine = queue -> routine;
if (queue -> mode & BLAS_LEGACY) { if (queue -> mode & BLAS_LEGACY) {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
} else } else {
if (queue -> mode & BLAS_PTHREAD) { if (queue -> mode & BLAS_PTHREAD) {
void (*pthreadcompat)(void *) = queue -> routine; void (*pthreadcompat)(void *) = queue -> routine;
(pthreadcompat)(queue -> args); (pthreadcompat)(queue -> args);
} else } else
(routine)(queue -> args, queue -> range_m, queue -> range_n, (routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0); queue -> sa, queue -> sb, 0);
}
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); if ((num > 1) && queue -> next)
exec_blas_async_wait(num - 1, queue -> next);
return 0; return 0;
} }
/* Shutdown procedure, but user don't have to call this routine. The */ //
/* kernel automatically kill threads. */ // Shutdown procedure, but user don't have to call this routine. The
// kernel automatically kills threads.
//
int BLASFUNC(blas_thread_shutdown)(void) { int BLASFUNC(blas_thread_shutdown)(void) {
int i; int i;
@ -556,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
return 0; return 0;
} }
//
// Legacy function to set number of threads
//
void goto_set_num_threads(int num_threads) void goto_set_num_threads(int num_threads)
{ {
long i; long i;
@ -577,11 +555,11 @@ void goto_set_num_threads(int num_threads)
SetEvent(kickoff_event); SetEvent(kickoff_event);
for (i = num_threads - 1; i < blas_num_threads - 1; i++) { for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i); //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
WaitForSingleObject(blas_threads[i], INFINITE); WaitForSingleObject(blas_threads[i], INFINITE);
//printf("set_num_threads: thread [%d] has quit.\n", i); //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
CloseHandle(blas_threads[i]); CloseHandle(blas_threads[i]);
} }
@ -610,7 +588,7 @@ void goto_set_num_threads(int num_threads)
} }
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) { for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
//printf("set_num_threads: creating thread [%d]\n", i); //MT_TRACE("set_num_threads: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0, blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i, blas_thread_server, (void *)i,
@ -625,6 +603,9 @@ void goto_set_num_threads(int num_threads)
blas_cpu_number = num_threads; blas_cpu_number = num_threads;
} }
//
// Openblas function to set thread count
//
void openblas_set_num_threads(int num) void openblas_set_num_threads(int num)
{ {
goto_set_num_threads(num); goto_set_num_threads(num);

View File

@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR;
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
#else #else
extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN; extern gotoblas_t gotoblas_ZEN;

View File

@ -1,6 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */ /* Copyright 2023-2024 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -143,12 +143,13 @@ extern gotoblas_t gotoblas_ARMV8SVE;
#endif #endif
extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_THUNDERX3T110;
#endif #endif
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
extern void openblas_warning(int verbose, const char * msg); extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1 #define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
#define NUM_CORETYPES 16 #define NUM_CORETYPES 17
/* /*
* In case asm/hwcap.h is outdated on the build system, make sure * In case asm/hwcap.h is outdated on the build system, make sure
@ -178,6 +179,7 @@ static char *corename[] = {
"emag8180", "emag8180",
"neoversen1", "neoversen1",
"neoversev1", "neoversev1",
"neoversev2",
"neoversen2", "neoversen2",
"thunderx3t110", "thunderx3t110",
"cortexa55", "cortexa55",
@ -198,10 +200,11 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11]; if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13];
if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
return corename[NUM_CORETYPES]; return corename[NUM_CORETYPES];
} }
@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
case 9: return (&gotoblas_EMAG8180); case 9: return (&gotoblas_EMAG8180);
case 10: return (&gotoblas_NEOVERSEN1); case 10: return (&gotoblas_NEOVERSEN1);
case 11: return (&gotoblas_NEOVERSEV1); case 11: return (&gotoblas_NEOVERSEV1);
case 12: return (&gotoblas_NEOVERSEN2); case 12: return (&gotoblas_NEOVERSEV2);
case 13: return (&gotoblas_THUNDERX3T110); case 13: return (&gotoblas_NEOVERSEN2);
case 14: return (&gotoblas_CORTEXA55); case 14: return (&gotoblas_THUNDERX3T110);
case 15: return (&gotoblas_ARMV8SVE); case 15: return (&gotoblas_CORTEXA55);
case 16: return (&gotoblas_ARMV8SVE);
} }
snprintf(message, 128, "Core not found: %s\n", coretype); snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message); openblas_warning(1, message);
@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_NEOVERSEN1; return &gotoblas_NEOVERSEN1;
}else }else
return &gotoblas_NEOVERSEV1; return &gotoblas_NEOVERSEV1;
case 0xd4f:
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
} else {
return &gotoblas_NEOVERSEV2;
}
#endif #endif
case 0xd05: // Cortex A55 case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55; return &gotoblas_CORTEXA55;

View File

@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
#define CPU_POWER9 9 #define CPU_POWER9 9
#define CPU_POWER10 10 #define CPU_POWER10 10
#ifndef POWER_9
#define POWER_9 0x20000 /* 9 class CPU */
#endif
#ifndef POWER_10
#define POWER_10 0x40000 /* 10 class CPU */
#endif
#ifdef _AIX #ifdef _AIX
#include <sys/systemcfg.h> #include <sys/systemcfg.h>
@ -62,7 +69,7 @@ static int cpuid(void)
else if (arch == POWER_9) return CPU_POWER9; else if (arch == POWER_9) return CPU_POWER9;
#endif #endif
#ifdef POWER_10 #ifdef POWER_10
else if (arch == POWER_10) return CPU_POWER10; else if (arch >= POWER_10) return CPU_POWER10;
#endif #endif
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }
@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) {
if (gotoblas && gotoblas -> init) { if (gotoblas && gotoblas -> init) {
strncpy(coren,gotoblas_corename(),20); strncpy(coren,gotoblas_corename(),20);
sprintf(coremsg, "Core: %s\n",coren); sprintf(coremsg, "Core: %s\n",coren);
if (getenv("GET_OPENBLAS_CORETYPE")) {
fprintf(stderr, "%s", coremsg);
}
openblas_warning(2, coremsg); openblas_warning(2, coremsg);
gotoblas -> init(); gotoblas -> init();
} else { } else {

View File

@ -3214,7 +3214,7 @@ void blas_shutdown(void){
#endif #endif
memory[pos].lock = 0; memory[pos].lock = 0;
} }
if (memory_overflowed) if (memory_overflowed) {
for (pos = 0; pos < NEW_BUFFERS; pos ++){ for (pos = 0; pos < NEW_BUFFERS; pos ++){
newmemory[pos].addr = (void *)0; newmemory[pos].addr = (void *)0;
newmemory[pos].used = 0; newmemory[pos].used = 0;
@ -3223,6 +3223,10 @@ void blas_shutdown(void){
#endif #endif
newmemory[pos].lock = 0; newmemory[pos].lock = 0;
} }
free(newmemory);
newmemory = NULL;
memory_overflowed = 0;
}
UNLOCK_COMMAND(&alloc_lock); UNLOCK_COMMAND(&alloc_lock);

View File

@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef SMP_SERVER #ifdef SMP_SERVER
extern void openblas_set_num_threads(int num_threads) ; extern void openblas_set_num_threads(int num_threads) ;
extern int openblas_get_num_threads(void) ;
void openblas_set_num_threads_(int* num_threads){ void openblas_set_num_threads_(int* num_threads){
openblas_set_num_threads(*num_threads); openblas_set_num_threads(*num_threads);
} }
/* Set the thread count and hand back the count that was in effect before
   the call, as reported by openblas_get_num_threads(). The new value is
   also stored in blas_omp_threads_local. */
int openblas_set_num_threads_local(int num_threads){
  /* Sample the current count first; it is the return value. */
  const int previous = openblas_get_num_threads();

  openblas_set_num_threads(num_threads);
  blas_omp_threads_local = num_threads;

  return previous;
}
#else #else
//Single thread //Single thread
@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
void openblas_set_num_threads_(int* num_threads){ void openblas_set_num_threads_(int* num_threads){
} }
/* Single-thread variant: the requested count is ignored and the function
   always reports 1. */
int openblas_set_num_threads_local(int num_threads){
  (void)num_threads;  /* intentionally unused in this variant */
  return 1;
}
#endif #endif

View File

@ -73,6 +73,10 @@ endif
endif endif
endif endif
ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
EXTRALIB += -lxlf90
endif
ifeq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), PGI)
EXTRALIB += -pgf90libs EXTRALIB += -pgf90libs
endif endif
@ -132,8 +136,12 @@ libgoto_hpl.def : $(GENSYM)
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F) ./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
ifeq ($(FIXED_LIBNAME),1)
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib
else
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif endif
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
$(LIBDYNNAME) : ../$(LIBNAME) osx.def $(LIBDYNNAME) : ../$(LIBNAME) osx.def
@ -169,8 +177,12 @@ INTERNALNAME = $(LIBPREFIX).so
FEXTRALIB += -lm FEXTRALIB += -lm
EXTRALIB += -lm EXTRALIB += -lm
else else
ifeq ($(FIXED_LIBNAME),1)
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so
else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION) INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX)) ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
../$(LIBSONAME) : ../$(LIBNAME) linktest.c ../$(LIBSONAME) : ../$(LIBNAME) linktest.c
@ -248,6 +260,20 @@ endif
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
so : ../$(LIBSONAME) linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
rm -f linktest
../$(LIBSONAME) : aix.exp
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,-bcdtors:all:-2147481648:s,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
aix.exp :
/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
/usr/bin/sort -u > aix.exp
ifeq ($(COMPILER_F77), xlf) ifeq ($(COMPILER_F77), xlf)
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
@ -289,6 +315,11 @@ test : linktest.c
linktest.c : $(GENSYM) ../Makefile.system ../getarch.c linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c ./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
ifeq ($(F_COMPILER), IBM)
mv linktest.c linktest.c.FIRST
egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c
rm linktest.c.FIRST
endif
clean :: clean ::
@rm -f *.def *.dylib __.SYMDEF* *.renamed @rm -f *.def *.dylib __.SYMDEF* *.renamed

View File

@ -60,6 +60,7 @@ cblasobjsc="
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_scnrm2 cblas_scasum cblas_cgemmt
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
" "
cblasobjsd=" cblasobjsd="
cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
@ -69,6 +70,7 @@ cblasobjsd="
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
cblas_damax cblas_damin
" "
cblasobjss=" cblasobjss="
@ -80,6 +82,7 @@ cblasobjss="
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
cblas_strsv cblas_sgeadd cblas_sgemmt cblas_strsv cblas_sgeadd cblas_sgemmt
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
cblas_samax cblas_samin
" "
cblasobjsz=" cblasobjsz="
@ -91,6 +94,7 @@ cblasobjsz="
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_zaxpby cblas_zgeadd cblas_zgemmt
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
" "
cblasobjs="cblas_xerbla" cblasobjs="cblas_xerbla"
@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z
zgedmd zgedmd
zgedmdq zgedmdq
" "
#functions added post 3.11
lapackobjs2c="$lapackobjs2c
claqp2rk
claqp3rk
ctrsyl3
"
# claqz0
# claqz1
# claqz2
# claqz3
# clatrs3
lapackobjs2d="$lapackobjs2d
dgelqs
dgelst
dgeqp3rk
dgeqrs
dlaqp2rk
dlaqp3rk
dlarmm
dlatrs3
dtrsyl3
"
# dlaqz0
# dlaqz1
# dlaqz2
# dlaqz3
# dlaqz4
lapackobjs2z="$lapackobjs2z
zgelqs
zgelst
zgeqp3rk
zgeqrs
zlaqp2rk
zlaqp3rk
zlatrs3
zrscl
ztrsyl3
"
# zlaqz0
# zlaqz1
# zlaqz2
# zlaqz3
lapack_extendedprecision_objs=" lapack_extendedprecision_objs="
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@ -1622,6 +1673,14 @@ lapackeobjsc="
LAPACKE_cgetsqrhrt_work LAPACKE_cgetsqrhrt_work
LAPACKE_cungtsqr_row LAPACKE_cungtsqr_row
LAPACKE_cungtsqr_row_work LAPACKE_cungtsqr_row_work
LAPACKE_clangb
LAPACKE_clangb_work
LAPACKE_ctrsyl3
LAPACKE_ctrsyl3_work
LAPACKE_ctz_nancheck
LAPACKE_ctz_trans
LAPACKE_cunhr_col
LAPACKE_cunhr_col_work
" "
lapackeobjsd=" lapackeobjsd="
@ -2239,6 +2298,14 @@ lapackeobjsd="
LAPACKE_dgetsqrhrt_work LAPACKE_dgetsqrhrt_work
LAPACKE_dorgtsqr_row LAPACKE_dorgtsqr_row
LAPACKE_dorgtsqr_row_work LAPACKE_dorgtsqr_row_work
LAPACKE_dlangb
LAPACKE_dlangb_work
LAPACKE_dorhr_col
LAPACKE_dorhr_col_work
LAPACKE_dtrsyl3
LAPACKE_dtrsyl3_work
LAPACKE_dtz_nancheck
LAPACKE_dtz_trans
" "
lapackeobjss=" lapackeobjss="
@ -2848,6 +2915,14 @@ lapackeobjss="
LAPACKE_sgetsqrhrt_work LAPACKE_sgetsqrhrt_work
LAPACKE_sorgtsqr_row LAPACKE_sorgtsqr_row
LAPACKE_sorgtsqr_row_work LAPACKE_sorgtsqr_row_work
LAPACKE_slangb
LAPACKE_slangb_work
LAPACKE_sorhr_col
LAPACKE_sorhr_col_work
LAPACKE_strsyl3
LAPACKE_strsyl3_work
LAPACKE_stz_nancheck
LAPACKE_stz_trans
" "
lapackeobjsz=" lapackeobjsz="
@ -3515,6 +3590,14 @@ lapackeobjsz="
LAPACKE_zgetsqrhrt_work LAPACKE_zgetsqrhrt_work
LAPACKE_zungtsqr_row LAPACKE_zungtsqr_row
LAPACKE_zungtsqr_row_work LAPACKE_zungtsqr_row_work
LAPACKE_zlangb
LAPACKE_zlangb_work
LAPACKE_ztrsyl3
LAPACKE_ztrsyl3_work
LAPACKE_ztz_nancheck
LAPACKE_ztz_trans
LAPACKE_zunhr_col
LAPACKE_zunhr_col_work
" "
## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s="
ssysv_aa_2stage ssytrf_aa_2stage ssysv_aa_2stage ssytrf_aa_2stage
ssytrs_aa_2stage ssytrs_aa_2stage
slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
slarfb_gett
" "
lapack_embeded_underscore_objs_c=" lapack_embeded_underscore_objs_c="
chetf2_rook chetrf_rook chetri_rook chetf2_rook chetrf_rook chetri_rook
@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c="
csysv_aa_2stage csytrf_aa_2stage csysv_aa_2stage csytrf_aa_2stage
csytrs_aa_2stage csytrs_aa_2stage
claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
clarfb_gett
" "
lapack_embeded_underscore_objs_d=" lapack_embeded_underscore_objs_d="
dlasyf_rook dlasyf_rook
@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d="
dsysv_aa_2stage dsysv_aa_2stage
dsytrf_aa_2stage dsytrs_aa_2stage dsytrf_aa_2stage dsytrs_aa_2stage
dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
dlarfb_gett
" "
lapack_embeded_underscore_objs_z=" lapack_embeded_underscore_objs_z="
zhetf2_rook zhetrf_rook zhetri_rook zhetf2_rook zhetrf_rook zhetri_rook
@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z="
zhetrs_aa_2stage zsysv_aa_2stage zhetrs_aa_2stage zsysv_aa_2stage
zsytrf_aa_2stage zsytrs_aa_2stage zsytrf_aa_2stage zsytrs_aa_2stage
zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
zlarfb_gett
" "
dirname=`pwd -P`/../lapack-netlib dirname=`pwd -P`/../lapack-netlib

View File

@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
pathf90 pathf95 pathf90 pathf95
pgf95 pgf90 pgf77 pgfortran nvfortran pgf95 pgf90 pgf77 pgfortran nvfortran
flang egfortran flang egfortran
ifort nagfor ifx ftn crayftn" ifort nagfor ifx ftn crayftn armflang"
for list in $lists; do for list in $lists; do
for p in $path; do for p in $path; do
@ -86,6 +86,10 @@ else
vendor=CRAY vendor=CRAY
openmp='-fopenmp' openmp='-fopenmp'
;; ;;
*Arm\ F90*)
vendor=FLANG
openmp='-fopenmp'
;;
*GNU*|*GCC*) *GNU*|*GCC*)
v="${data#*GCC: *\) }" v="${data#*GCC: *\) }"

View File

@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/sysinfo.h> #include <sys/sysinfo.h>
#include <unistd.h> #include <unistd.h>
#endif #endif
#if defined(AIX) #if defined(_AIX)
#include <unistd.h>
#include <sys/systemcfg.h>
#include <sys/sysinfo.h> #include <sys/sysinfo.h>
#endif #endif
@ -150,6 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_EV4 */ /* #define FORCE_EV4 */
/* #define FORCE_EV5 */ /* #define FORCE_EV5 */
/* #define FORCE_EV6 */ /* #define FORCE_EV6 */
/* #define FORCE_CSKY */
/* #define FORCE_CK860FV */
/* #define FORCE_GENERIC */ /* #define FORCE_GENERIC */
#ifdef FORCE_P2 #ifdef FORCE_P2
@ -1327,6 +1331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "CORTEXA73" #define CORENAME "CORTEXA73"
#endif #endif
/* Forced target selected by defining FORCE_CORTEXA76: sets FORCE, picks the
   ARM64 architecture / arm64 subdirectory, and supplies the L1/L2/DTB
   parameters and feature flags listed in ARCHCONFIG. */
#ifdef FORCE_CORTEXA76
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA76"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA76 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa76"
#define CORENAME "CORTEXA76"
#endif
#ifdef FORCE_CORTEXX1 #ifdef FORCE_CORTEXX1
#define FORCE #define FORCE
#define ARCHITECTURE "ARM64" #define ARCHITECTURE "ARM64"
@ -1677,9 +1696,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "c910v" #define LIBNAME "c910v"
#define CORENAME "C910V" #define CORENAME "C910V"
#endif #endif
#endif
#ifdef FORCE_x280
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "x280"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-Dx280 " \
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "x280"
#define CORENAME "x280"
#else #else
#endif #endif
/* Forced target selected by defining FORCE_RISCV64_ZVL256B: sets FORCE,
   picks the RISCV64 architecture / riscv64 subdirectory, and supplies the
   L1/L2/DTB parameters listed in ARCHCONFIG. */
#ifdef FORCE_RISCV64_ZVL256B
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "RISCV64_ZVL256B"
#define SUBDIRNAME "riscv64"
/* NOTE(review): L1_DATA_SIZE=64536 is not a power of two; 65536 (64 KiB)
   may have been intended. The FORCE_x280 block uses the same value.
   TODO confirm against the hardware documentation before changing. */
#define ARCHCONFIG "-DRISCV64_ZVL256B " \
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "riscv64_zvl256b"
#define CORENAME "RISCV64_ZVL256B"
#endif
/* Forced target selected by defining FORCE_RISCV64_ZVL128B: sets FORCE,
   picks the RISCV64 architecture / riscv64 subdirectory, and supplies the
   L1/L2/DTB parameters listed in ARCHCONFIG. */
#ifdef FORCE_RISCV64_ZVL128B
#define FORCE
#define ARCHITECTURE "RISCV64"
#define SUBARCHITECTURE "RISCV64_ZVL128B"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-DRISCV64_ZVL128B " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "riscv64_zvl128b"
#define CORENAME "RISCV64_ZVL128B"
#endif
#if defined(FORCE_E2K) || defined(__e2k__) #if defined(FORCE_E2K) || defined(__e2k__)
#define FORCE #define FORCE
@ -1692,6 +1748,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "generic" #define CORENAME "generic"
#endif #endif
/* Forced target selected by defining FORCE_CSKY: sets FORCE, picks the CSKY
   architecture / csky subdirectory, and supplies the L1/L2/DTB parameters
   listed in ARCHCONFIG. */
#ifdef FORCE_CSKY
#define FORCE
#define ARCHITECTURE "CSKY"
#define SUBARCHITECTURE "CSKY"
#define SUBDIRNAME "csky"
/* Each fragment must end with a space: adjacent string literals are
   concatenated, so "-DCSKY" without one would fuse with the next option
   into the single token "-DCSKY-DL1_DATA_SIZE=65536". */
#define ARCHCONFIG "-DCSKY " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "csky"
#define CORENAME "CSKY"
#endif
/* Forced target selected by defining FORCE_CK860FV: sets FORCE, picks the
   CSKY architecture / csky subdirectory, and supplies the L1/L2/DTB
   parameters listed in ARCHCONFIG. */
#ifdef FORCE_CK860FV
#define FORCE
#define ARCHITECTURE "CSKY"
/* NOTE(review): "CK860V" differs from the CK860FV spelling used by
   ARCHCONFIG, LIBNAME and CORENAME below; possibly a typo. TODO confirm
   what consumes SUBARCHITECTURE before changing it. */
#define SUBARCHITECTURE "CK860V"
#define SUBDIRNAME "csky"
#define ARCHCONFIG "-DCK860FV " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "ck860fv"
#define CORENAME "CK860FV"
#endif
#ifndef FORCE #ifndef FORCE
#ifdef USER_TARGET #ifdef USER_TARGET
@ -1766,7 +1849,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED #define OPENBLAS_SUPPORTED
#endif #endif
#ifndef OPENBLAS_SUPPORTED #ifndef OPENBLAS_SUPPORTED
#error "This arch/CPU is not supported by OpenBLAS." #error "This arch/CPU is not supported by OpenBLAS."
#endif #endif
@ -1805,11 +1887,13 @@ static int get_num_cores(void) {
return count; return count;
#elif defined(AIX) #elif defined(_AIX)
//returns the number of processors which are currently online //returns the number of processors which are currently online
count = sysconf(_SC_NPROCESSORS_ONLN); count = sysconf(_SC_NPROCESSORS_ONLN);
if (count <= 0) count = 2; if (count <= 0) count = 2;
return count;
#else #else
return 2; return 2;
#endif #endif
@ -1831,7 +1915,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE #ifdef FORCE
printf("CORE=%s\n", CORENAME); printf("CORE=%s\n", CORENAME);
#else #else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
printf("CORE=%s\n", get_corename()); printf("CORE=%s\n", get_corename());
#endif #endif
#endif #endif
@ -1979,7 +2063,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE #ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME); printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else #else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) #if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename()); printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif #endif
#endif #endif

View File

@ -119,6 +119,7 @@ endif ()
if (BUILD_BFLOAT16) if (BUILD_BFLOAT16)
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
@ -130,6 +131,8 @@ endif ()
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type}) GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})

View File

@ -270,7 +270,8 @@ CSBLAS1OBJS = \
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
cblas_samin.$(SUFFIX)
CSBLAS2OBJS = \ CSBLAS2OBJS = \
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@ -295,7 +296,8 @@ CDBLAS1OBJS = \
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
cblas_damin.$(SUFFIX)
CDBLAS2OBJS = \ CDBLAS2OBJS = \
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@ -315,7 +317,7 @@ CCBLAS1OBJS = \
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) \ cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX) cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
CCBLAS2OBJS = \ CCBLAS2OBJS = \
@ -340,12 +342,12 @@ CXERBLAOBJ = \
CZBLAS1OBJS = \ CZBLAS1OBJS = \
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \ cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
cblas_zcopy.$(SUFFIX) \ cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \ cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) \ cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX) cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
ifeq ($(BUILD_BFLOAT16),1) ifeq ($(BUILD_BFLOAT16),1)
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
endif endif
@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1) ifeq ($(BUILD_BFLOAT16),1)
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif endif

View File

@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order,
if (ldc < MAX(1, m)) info = 8; if (ldc < MAX(1, m)) info = 8;
if (lda < MAX(1, m)) info = 5; if (lda < MAX(1, m)) info = 5;
if (n < 0) info = 2; if (n < 0) info = 1;
if (m < 0) info = 1; if (m < 0) info = 2;
} }
if (info >= 0) { if (info >= 0) {

View File

@ -533,8 +533,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
MNK = (double) args.m * (double) args.n * (double) args.k; MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1; args.nthreads = 1;
else else {
args.nthreads = num_cpu_avail(3); args.nthreads = num_cpu_avail(3);
if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD);
}
args.common = NULL; args.common = NULL;
if (args.nthreads == 1) { if (args.nthreads == 1) {

View File

@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
char transA, transB, Uplo; char transA, transB, Uplo;
blasint nrowa, nrowb; blasint nrowa, nrowb;
#if defined(COMPLEX)
blasint ncolb;
#endif
IFLOAT *buffer; IFLOAT *buffer;
IFLOAT *aa, *bb; IFLOAT *aa, *bb;
FLOAT *cc; FLOAT *cc;
@ -157,17 +160,25 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
uplo = 1; uplo = 1;
nrowa = m; nrowa = m;
if (transa) nrowa = k; if (transa & 1) nrowa = k;
nrowb = k; nrowb = k;
if (transb) nrowb = m; #if defined(COMPLEX)
ncolb = m;
#endif
if (transb & 1) {
nrowb = m;
#if defined(COMPLEX)
ncolb = k;
#endif
}
info = 0; info = 0;
if (ldc < MAX(1, m)) if (ldc < MAX(1, m))
info = 13; info = 13;
if (ldb < MAX(1, nrowa)) if (ldb < MAX(1, nrowb))
info = 10; info = 10;
if (lda < MAX(1, nrowb)) if (lda < MAX(1, nrowa))
info = 8; info = 8;
if (k < 0) if (k < 0)
info = 5; info = 5;
@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
blasint info; blasint info;
blasint lda, ldb; blasint lda, ldb;
FLOAT *a, *b; FLOAT *a, *b;
#if defined(COMPLEX)
blasint nrowb, ncolb;
#endif
XFLOAT *buffer; XFLOAT *buffer;
PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;
@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1; info = -1;
blasint nrowa, nrowb; blasint nrowa;
#if !defined(COMPLEX)
blasint nrowb;
#endif
nrowa = m; nrowa = m;
if (transa) nrowa = k; if (transa & 1) nrowa = k;
nrowb = k; nrowb = k;
if (transb) nrowb = m; #if defined(COMPLEX)
ncolb = m;
#endif
if (transb & 1) {
nrowb = m;
#if defined(COMPLEX)
ncolb = k;
#endif
}
if (ldc < MAX(1, m)) if (ldc < MAX(1, m))
info = 13; info = 13;
@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
info = -1; info = -1;
blasint ncola, ncolb; blasint ncola;
ncola = k; #if !defined(COMPLEX)
if (transa) ncola = m; blasint ncolb;
#endif
ncola = m;
if (transa & 1) ncola = k;
ncolb = k;
#if defined(COMPLEX)
nrowb = m;
#endif
if (transb & 1) {
#if defined(COMPLEX)
nrowb = k;
#endif
ncolb = m; ncolb = m;
if (transb) ncolb = k; }
if (ldc < MAX(1,m)) if (ldc < MAX(1,m))
info = 13; info = 13;
if (ldb < MAX(1, ncolb)) if (ldb < MAX(1, ncolb))
info = 10;
if (lda < MAX(1, ncola))
info = 8; info = 8;
if (lda < MAX(1, ncola))
info = 10;
if (k < 0) if (k < 0)
info = 5; info = 5;
if (m < 0) if (m < 0)
info = 4; info = 4;
if (transb < 0) if (transb < 0)
info = 3;
if (transa < 0)
info = 2; info = 2;
if (transa < 0)
info = 3;
if (uplo < 0) if (uplo < 0)
info = 1; info = 1;
} }
@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
IDEBUG_START; IDEBUG_START;
const blasint incb = (transb == 0) ? 1 : ldb; #if defined(COMPLEX)
if (transb > 1){
#ifndef CBLAS
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
#else
if (order == CblasColMajor)
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
if (order == CblasRowMajor)
IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
#endif
}
#endif
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
if (uplo == 1) { if (uplo == 1) {
for (i = 0; i < m; i++) { for (i = 0; i < m; i++) {
@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#if defined(COMPLEX) #if defined(COMPLEX)
aa = a + i * 2; aa = a + i * 2;
bb = b + i * ldb * 2; bb = b + i * ldb * 2;
if (transa) { if (transa & 1) {
aa = a + lda * i * 2; aa = a + lda * i * 2;
} }
if (transb) if (transb & 1)
bb = b + i * 2; bb = b + i * 2;
cc = c + i * 2 * ldc + i * 2; cc = c + i * 2 * ldc + i * 2;
#else #else
aa = a + i; aa = a + i;
bb = b + i * ldb; bb = b + i * ldb;
if (transa) { if (transa & 1) {
aa = a + lda * i; aa = a + lda * i;
} }
if (transb) if (transb & 1)
bb = b + i; bb = b + i;
cc = c + i * ldc + i; cc = c + i * ldc + i;
#endif #endif
@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
NULL, 0); NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) if (alpha_r == ZERO && alpha_i == ZERO)
return; continue;
#else #else
if (beta != ONE) if (beta != ONE)
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@ -472,13 +522,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
IDEBUG_START; IDEBUG_START;
buffer_size = j + k + 128 / sizeof(FLOAT); buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT); buffer_size += 160 / sizeof(FLOAT);
#endif #endif
// for alignment // for alignment
buffer_size = (buffer_size + 3) & ~3; buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer); STACK_ALLOC(buffer_size, IFLOAT, buffer);
#ifdef SMP #ifdef SMP
@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif #endif
#if defined(COMPLEX) #if defined(COMPLEX)
if (!transa) if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
aa, lda, bb, incb, cc, 1, aa, lda, bb, incb, cc, 1,
buffer); buffer);
@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
aa, lda, bb, incb, cc, 1, aa, lda, bb, incb, cc, 1,
buffer); buffer);
#else #else
if (!transa) if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda, (gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
bb, incb, cc, 1, buffer); bb, incb, cc, 1, buffer);
else else
@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif #endif
#ifdef SMP #ifdef SMP
} else { } else {
if (!transa) if (!(transa & 1))
(gemv_thread[(int)transa]) (j, k, alpha, aa, (gemv_thread[(int)transa]) (j, k, alpha, aa,
lda, bb, incb, cc, lda, bb, incb, cc,
1, buffer, 1, buffer,
@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
l = j; l = j;
#if defined COMPLEX #if defined COMPLEX
bb = b + i * ldb * 2; bb = b + i * ldb * 2;
if (transb) { if (transb & 1) {
bb = b + i * 2; bb = b + i * 2;
} }
cc = c + i * 2 * ldc; cc = c + i * 2 * ldc;
#else #else
bb = b + i * ldb; bb = b + i * ldb;
if (transb) { if (transb & 1) {
bb = b + i; bb = b + i;
} }
cc = c + i * ldc; cc = c + i * ldc;
@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
NULL, 0); NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO) if (alpha_r == ZERO && alpha_i == ZERO)
return; continue;
#else #else
if (beta != ONE) if (beta != ONE)
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@ -561,13 +611,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif #endif
IDEBUG_START; IDEBUG_START;
buffer_size = j + k + 128 / sizeof(FLOAT); buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI #ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT); buffer_size += 160 / sizeof(FLOAT);
#endif #endif
// for alignment // for alignment
buffer_size = (buffer_size + 3) & ~3; buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer); STACK_ALLOC(buffer_size, IFLOAT, buffer);
#ifdef SMP #ifdef SMP
@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#endif #endif
#if defined(COMPLEX) #if defined(COMPLEX)
if (!transa) if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i, (gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
a, lda, bb, incb, cc, 1, a, lda, bb, incb, cc, 1,
buffer); buffer);
@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
a, lda, bb, incb, cc, 1, a, lda, bb, incb, cc, 1,
buffer); buffer);
#else #else
if (!transa) if (!(transa & 1))
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb, (gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
incb, cc, 1, buffer); incb, cc, 1, buffer);
else else
@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
#ifdef SMP #ifdef SMP
} else { } else {
if (!transa) if (!(transa & 1))
(gemv_thread[(int)transa]) (j, k, alpha, a, lda, (gemv_thread[(int)transa]) (j, k, alpha, a, lda,
bb, incb, cc, 1, bb, incb, cc, 1,
buffer, nthreads); buffer, nthreads);

View File

@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
#ifdef SMP #ifdef SMP
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD ) if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
nthreads = 1; nthreads = 1;
else else
nthreads = num_cpu_avail(2); nthreads = num_cpu_avail(2);

View File

@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
} }
#endif #endif
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT); if ( *rows > *cols )
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT);
else
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT);
b = malloc(msize); b = malloc(msize);
if ( b == NULL ) if ( b == NULL )

View File

@ -95,14 +95,19 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
#ifdef SMP #ifdef SMP
args.common = NULL; args.common = NULL;
#ifndef DOUBLE #ifndef DOUBLE
if (args.m*args.n < 40000) int nmax = 40000;
#else #else
if (args.m*args.n < 10000) int nmax = 10000;
#endif #endif
if (args.m*args.n <nmax) {
args.nthreads = 1; args.nthreads = 1;
else } else {
args.nthreads = num_cpu_avail(4); args.nthreads = num_cpu_avail(4);
if ((args.m*args.n)/args.nthreads <nmax)
args.nthreads = (args.m*args.n)/nmax;
}
if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif

View File

@ -113,13 +113,17 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP #ifdef SMP
args.common = NULL; args.common = NULL;
#ifndef DOUBLE #ifndef DOUBLE
if (args.n <128) int nmax = 128;
#else #else
if (args.n <64) int nmax = 64;
#endif #endif
if (args.n <nmax) {
args.nthreads = 1; args.nthreads = 1;
else } else {
args.nthreads = num_cpu_avail(4); args.nthreads = num_cpu_avail(4);
if (args.n/args.nthreads <nmax)
args.nthreads = args.n/nmax;
}
if (args.nthreads == 1) { if (args.nthreads == 1) {
#endif #endif

View File

@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
if (trans_arg == 'R') trans = 0; if (trans_arg == 'R') trans = 0;
if (trans_arg == 'C') trans = 1; if (trans_arg == 'C') trans = 1;
TOUPPER(uplo_arg);
uplo = -1; uplo = -1;
if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'L') uplo = 1;
TOUPPER(diag_arg);
diag = -1; diag = -1;
if (diag_arg == 'U') diag = 0; if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1; if (diag_arg == 'N') diag = 1;

View File

@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
if (trans_arg == 'R') trans = 2; if (trans_arg == 'R') trans = 2;
if (trans_arg == 'C') trans = 3; if (trans_arg == 'C') trans = 3;
TOUPPER(uplo_arg);
uplo = -1; uplo = -1;
if (uplo_arg == 'U') uplo = 0; if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1; if (uplo_arg == 'L') uplo = 1;
TOUPPER(diag_arg);
diag = -1; diag = -1;
if (diag_arg == 'U') diag = 0; if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1; if (diag_arg == 'N') diag = 1;

View File

@ -46,6 +46,12 @@
#ifdef USE_ABS #ifdef USE_ABS
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
#ifndef USE_MIN #ifndef USE_MIN
/* ABS & MAX */ /* ABS & MAX */
@ -92,6 +98,8 @@
#else #else
#define ABS
#ifndef USE_MIN #ifndef USE_MIN
/* MAX */ /* MAX */
@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
if (n <= 0) return 0; if (n <= 0) return 0;
#ifndef COMPLEX
if (incx == 0) return (ABS(*x));
#else
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
#endif
IDEBUG_START; IDEBUG_START;
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();
@ -145,7 +159,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
#else #else
#ifdef COMPLEX
FLOAT CNAME(blasint n, void *vx, blasint incx){
FLOAT *x = (FLOAT*) vx;
#else
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
#endif
FLOAT ret; FLOAT ret;
@ -153,6 +172,12 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
if (n <= 0) return 0; if (n <= 0) return 0;
#ifndef COMPLEX
if (incx == 0) return (ABS(*x));
#else
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
#endif
IDEBUG_START; IDEBUG_START;
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();

View File

@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
else else
{ {
dp2 = *dd2 * dy1; dp2 = *dd2 * dy1;
if(dp2 == ZERO)
{
dflag = -TWO;
dparam[0] = dflag;
return;
}
dp1 = *dd1 * *dx1; dp1 = *dd1 * *dx1;
dq2 = dp2 * dy1; dq2 = dp2 * dy1;
dq1 = dp1 * *dx1; dq1 = dp1 * *dx1;
@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dh12 = dp2 / dp1; dh12 = dp2 / dp1;
du = ONE - dh12 * dh21; du = ONE - dh12 * dh21;
if(du > ZERO)
{
dflag = ZERO; dflag = ZERO;
*dd1 = *dd1 / du; *dd1 = *dd1 / du;
*dd2 = *dd2 / du; *dd2 = *dd2 / du;
*dx1 = *dx1 * du; *dx1 = *dx1 * du;
} else {
dflag = -ONE;
dh11 = ZERO;
dh12 = ZERO;
dh21 = ZERO;
dh22 = ZERO;
*dd1 = ZERO;
*dd2 = ZERO;
*dx1 = ZERO;
}
} }
else else

447
interface/sbgemmt.c Normal file
View File

@ -0,0 +1,447 @@
/*********************************************************************/
/* Copyright 2024, The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
#define SMP_THRESHOLD_MIN 65536.0
#define ERROR_NAME "SBGEMMT "
#ifndef GEMM_MULTITHREAD_THRESHOLD
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
#ifndef CBLAS

/*
 * SBGEMMT, Fortran-style entry point.
 *
 * Updates one triangle of the m-by-m matrix C, one column at a time, by
 * issuing an SBGEMV call per column (see the loops at the end of this
 * function).  Presumably this realises
 *     C := alpha * op(A) * op(B) + beta * C
 * restricted to the triangle selected by UPLO -- confirm against the SBGEMV
 * kernel contract.
 *
 * All scalar arguments are passed by reference.  A and B hold IFLOAT
 * elements, C holds FLOAT elements.
 *
 *   UPLO            'U' -> upper triangle, 'L' -> lower triangle
 *   TRANSA, TRANSB  'N'/'R' -> not transposed, 'T'/'C' -> transposed
 *   M               order of C
 *   K               inner dimension
 *   ldA, ldB, ldC   leading dimensions of a, b and c
 *
 * On an invalid argument xerbla is called with the 1-based position of the
 * offending argument and the function returns without touching C.
 */
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
	  blasint * M, blasint * K,
	  FLOAT * Alpha,
	  IFLOAT * a, blasint * ldA,
	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
{
	blasint m, k;
	blasint lda, ldb, ldc;
	int transa, transb, uplo;
	blasint info;

	char transA, transB, Uplo;
	blasint nrowa, nrowb;
	IFLOAT *buffer;
	IFLOAT *aa, *bb;
	FLOAT *cc;
	FLOAT alpha, beta;

	PRINT_DEBUG_NAME;

	m = *M;
	k = *K;

	alpha = *Alpha;
	beta = *Beta;

	lda = *ldA;
	ldb = *ldB;
	ldc = *ldC;

	transA = *TRANSA;
	transB = *TRANSB;
	Uplo = *UPLO;
	TOUPPER(transA);
	TOUPPER(transB);
	TOUPPER(Uplo);

	/* -1 marks "not recognised"; it is turned into an error below. */
	transa = -1;
	transb = -1;
	uplo = -1;

	/* Real data: the conjugate variants map onto their plain counterparts. */
	if (transA == 'N')
		transa = 0;
	if (transA == 'T')
		transa = 1;
	if (transA == 'R')
		transa = 0;
	if (transA == 'C')
		transa = 1;

	if (transB == 'N')
		transb = 0;
	if (transB == 'T')
		transb = 1;
	if (transB == 'R')
		transb = 0;
	if (transB == 'C')
		transb = 1;

	if (Uplo == 'U')
		uplo = 0;
	if (Uplo == 'L')
		uplo = 1;

	/* Minimum leading dimensions of A and B as stored. */
	nrowa = m;
	if (transa & 1) nrowa = k;
	nrowb = k;
	if (transb & 1) nrowb = m;

	/*
	 * Checks run from the last argument to the first, so that the
	 * lowest-numbered invalid argument is the one finally left in info.
	 */
	info = 0;

	if (ldc < MAX(1, m))
		info = 13;
	if (ldb < MAX(1, nrowb))
		info = 10;
	if (lda < MAX(1, nrowa))
		info = 8;
	if (k < 0)
		info = 5;
	if (m < 0)
		info = 4;
	if (transb < 0)
		info = 3;
	if (transa < 0)
		info = 2;
	if (uplo < 0)
		info = 1;

	if (info != 0) {
		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
		return;
	}
#else

/*
 * SBGEMMT, CBLAS entry point.
 *
 * Same operation as the Fortran-style entry point, with scalars passed by
 * value and an explicit storage order.  For CblasRowMajor the roles of A and
 * B (pointers, leading dimensions and transpose flags) are exchanged before
 * entering the shared column-major code path below.
 *
 * On an invalid argument xerbla is called and the function returns without
 * touching C.  An unrecognised `order` leaves info at 0, which is reported
 * as well because the test is `info >= 0`.
 */
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
	   blasint k,
	   FLOAT alpha,
	   IFLOAT * A, blasint LDA,
	   IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
{
	IFLOAT *aa, *bb;
	FLOAT *cc;

	int transa, transb, uplo;
	blasint info;
	blasint lda, ldb;
	IFLOAT *a, *b;
	/*
	 * Declared as IFLOAT * (not XFLOAT *) so that it agrees with the
	 * element type handed to STACK_ALLOC below and with the declaration
	 * used by the Fortran-style entry point.
	 */
	IFLOAT *buffer;

	PRINT_DEBUG_CNAME;

	uplo = -1;
	transa = -1;
	transb = -1;
	info = 0;

	if (order == CblasColMajor) {
		if (Uplo == CblasUpper) uplo = 0;
		if (Uplo == CblasLower) uplo = 1;

		if (TransA == CblasNoTrans)
			transa = 0;
		if (TransA == CblasTrans)
			transa = 1;
		if (TransA == CblasConjNoTrans)
			transa = 0;
		if (TransA == CblasConjTrans)
			transa = 1;

		if (TransB == CblasNoTrans)
			transb = 0;
		if (TransB == CblasTrans)
			transb = 1;
		if (TransB == CblasConjNoTrans)
			transb = 0;
		if (TransB == CblasConjTrans)
			transb = 1;

		a = (void *)A;
		b = (void *)B;
		lda = LDA;
		ldb = LDB;

		/* -1 means "no error"; any check below overwrites it. */
		info = -1;

		blasint nrowa;
		blasint nrowb;

		nrowa = m;
		if (transa & 1) nrowa = k;
		nrowb = k;
		if (transb & 1) nrowb = m;

		/* Last argument first, so the lowest position wins. */
		if (ldc < MAX(1, m))
			info = 13;
		if (ldb < MAX(1, nrowb))
			info = 10;
		if (lda < MAX(1, nrowa))
			info = 8;
		if (k < 0)
			info = 5;
		if (m < 0)
			info = 4;
		if (transb < 0)
			info = 3;
		if (transa < 0)
			info = 2;
		if (uplo < 0)
			info = 1;
	}

	if (order == CblasRowMajor) {
		/* Exchange the operands: internal a/lda/transa come from B. */
		a = (void *)B;
		b = (void *)A;

		lda = LDB;
		ldb = LDA;

		if (Uplo == CblasUpper) uplo = 0;
		if (Uplo == CblasLower) uplo = 1;

		if (TransB == CblasNoTrans)
			transa = 0;
		if (TransB == CblasTrans)
			transa = 1;
		if (TransB == CblasConjNoTrans)
			transa = 0;
		if (TransB == CblasConjTrans)
			transa = 1;

		if (TransA == CblasNoTrans)
			transb = 0;
		if (TransA == CblasTrans)
			transb = 1;
		if (TransA == CblasConjNoTrans)
			transb = 0;
		if (TransA == CblasConjTrans)
			transb = 1;

		info = -1;

		blasint ncola;
		blasint ncolb;

		ncola = m;
		if (transa & 1) ncola = k;
		ncolb = k;

		if (transb & 1) {
			ncolb = m;
		}

		/*
		 * Because the operands were exchanged above, the codes for
		 * lda/ldb and transa/transb are exchanged too: internal ldb is
		 * the caller's LDA (8), internal lda is the caller's LDB (10),
		 * internal transb is the caller's TransA (2) and internal
		 * transa is the caller's TransB (3).
		 */
		if (ldc < MAX(1,m))
			info = 13;
		if (ldb < MAX(1, ncolb))
			info = 8;
		if (lda < MAX(1, ncola))
			info = 10;
		if (k < 0)
			info = 5;
		if (m < 0)
			info = 4;
		if (transb < 0)
			info = 2;
		if (transa < 0)
			info = 3;
		if (uplo < 0)
			info = 1;
	}

	if (info >= 0) {
		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
		return;
	}

#endif

	/* ---- Code shared by both entry points from here on. ---- */

	int buffer_size;
	blasint i, j;

#ifdef SMP
	int nthreads;

	/* Threaded SBGEMV drivers, indexed by transa (0 = N, 1 = T). */
	static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
				     BLASLONG, IFLOAT *, BLASLONG, FLOAT,
				     FLOAT *, BLASLONG, int) = {
		sbgemv_thread_n, sbgemv_thread_t,
	};
#endif

	/* Single-threaded SBGEMV kernels, indexed by transa (0 = N, 1 = T). */
	int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
		       IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
		SBGEMV_N, SBGEMV_T,};

	/* Nothing to update for an empty C. */
	if (m == 0)
		return;

	IDEBUG_START;

	/* Stride between consecutive elements of one column of op(B). */
	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;

	if (uplo == 1) {
		/*
		 * Lower triangle: column i covers rows i..m-1, i.e. j = m - i
		 * elements starting on the diagonal of C.
		 */
		for (i = 0; i < m; i++) {
			j = m - i;

			/* Skip the first i rows of op(A). */
			aa = a + i;
			bb = b + i * ldb;
			if (transa & 1) {
				aa = a + lda * i;
			}
			if (transb & 1)
				bb = b + i;
			cc = c + i * ldc + i;

			/*
			 * Disabled pre-scaling of C; beta is handed to the
			 * gemv call below instead.
			 */
#if 0
			if (beta != ONE)
				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);

			if (alpha == ZERO)
				continue;
#endif

			IDEBUG_START;

			/*
			 * Scratch buffer.  It is allocated and released every
			 * iteration but is not passed to any gemv call in
			 * this function.
			 */
			buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
			buffer_size += 160 / sizeof(FLOAT);
#endif
			// for alignment
			buffer_size = (buffer_size + 3) & ~3;
			STACK_ALLOC(buffer_size, IFLOAT, buffer);

#ifdef SMP
			/* Small products are not worth threading. */
			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
				nthreads = 1;
			else
				nthreads = num_cpu_avail(2);

			if (nthreads == 1) {
#endif

				/* The transposed kernel takes (rows, cols) of A as stored. */
				if (!(transa & 1))
					(gemv[(int)transa]) (j, k, alpha, aa, lda,
							     bb, incb, beta, cc, 1);
				else
					(gemv[(int)transa]) (k, j, alpha, aa, lda,
							     bb, incb, beta, cc, 1);

#ifdef SMP
			} else {
				if (!(transa & 1))
					(gemv_thread[(int)transa]) (j, k, alpha, aa,
								    lda, bb, incb, beta, cc,
								    1, nthreads);
				else
					(gemv_thread[(int)transa]) (k, j, alpha, aa,
								    lda, bb, incb, beta, cc,
								    1, nthreads);
			}
#endif

			STACK_FREE(buffer);
		}
	} else {
		/*
		 * Upper triangle: column i covers rows 0..i, i.e. j = i + 1
		 * elements starting at the top of the column, so A is used
		 * from its origin.
		 */
		for (i = 0; i < m; i++) {
			j = i + 1;

			bb = b + i * ldb;
			if (transb & 1) {
				bb = b + i;
			}
			cc = c + i * ldc;

			/*
			 * Disabled pre-scaling of C; beta is handed to the
			 * gemv call below instead.
			 */
#if 0
			if (beta != ONE)
				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);

			if (alpha == ZERO)
				continue;
#endif

			IDEBUG_START;

			/* Scratch buffer; see the note in the lower-triangle loop. */
			buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
			buffer_size += 160 / sizeof(FLOAT);
#endif
			// for alignment
			buffer_size = (buffer_size + 3) & ~3;
			STACK_ALLOC(buffer_size, IFLOAT, buffer);

#ifdef SMP
			/* Small products are not worth threading. */
			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
				nthreads = 1;
			else
				nthreads = num_cpu_avail(2);

			if (nthreads == 1) {
#endif

				if (!(transa & 1))
					(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
							     incb, beta, cc, 1);
				else
					(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
							     incb, beta, cc, 1);

#ifdef SMP
			} else {
				if (!(transa & 1))
					(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
								    bb, incb, beta, cc, 1,
								    nthreads);
				else
					(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
								    bb, incb, beta, cc, 1,
								    nthreads);
			}
#endif

			STACK_FREE(buffer);
		}
	}

	IDEBUG_END;

	return;
}

View File

@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef CBLAS #ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY) void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
{ {
blasint n = *N; blasint n = *N;
blasint incx = *INCX; blasint incx = *INCX;
blasint incy = *INCY; blasint incy = *INCY;
FLOAT* ALPHA = (FLOAT*) VALPHA;
FLOAT* BETA = (FLOAT*) VBETA;
#else #else

View File

@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
info = 0; info = 0;
if (lda < MAX(1, m)) info = 6; if (lda < MAX(1, m)) info = 5;
if (ldc < MAX(1, m)) info = 8; if (ldc < MAX(1, m)) info = 8;
if (n < 0) info = 2; if (n < 0) info = 2;
@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order,
if (ldc < MAX(1, m)) info = 8; if (ldc < MAX(1, m)) info = 8;
if (lda < MAX(1, m)) info = 5; if (lda < MAX(1, m)) info = 5;
if (n < 0) info = 2; if (n < 0) info = 1;
if (m < 0) info = 1; if (m < 0) info = 2;
} }
if (info >= 0) { if (info >= 0) {

View File

@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
} }
#endif #endif
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2; if ( *rows > *cols )
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2;
else
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2;
b = malloc(msize); b = malloc(msize);
if ( b == NULL ) if ( b == NULL )

View File

@ -102,7 +102,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
if (ada >= h *safmin) { if (ada >= h *safmin) {
*C = sqrt(ada/h); *C = sqrt(ada/h);
*R = *DA / *C; *R = *DA / *C;
*(R+1) = *(DA+1) / *(C+1); *(R+1) = *(DA+1) / *C;
rtmax *= 2.; rtmax *= 2.;
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq); *S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
@ -115,7 +115,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
*C = ada / adahsq; *C = ada / adahsq;
if (*C >= safmin) { if (*C >= safmin) {
*R = *DA / *C; *R = *DA / *C;
*(R+1) = *(DA+1) / *(C+1); *(R+1) = *(DA+1) / *C;
} else { } else {
*R = *DA * (h / adahsq); *R = *DA * (h / adahsq);
*(R+1) = *(DA+1) * (h / adahsq); *(R+1) = *(DA+1) * (h / adahsq);

View File

@ -1349,6 +1349,9 @@ endif ()
set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES) get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES)
set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}") set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}")
if (USE_GEMM3M)
target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
endif()
endfunction () endfunction ()

View File

@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n == 1 ) return( ABS(x[0]) ); if ( n == 1 ) return( ABS(x[0]) );
n *= inc_x; n *= inc_x;
while(i < n) while(abs(i) < abs(n))
{ {
if ( x[i] != 0.0 ) if ( x[i] != 0.0 )

View File

@ -62,7 +62,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;
n *= inc_x2; n *= inc_x2;
while(i < n) while(abs(i) < abs(n))
{ {
if ( x[i] != 0.0 ) if ( x[i] != 0.0 )

View File

@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
else else
{ {
temp = - da_i * x[ip+1] ; temp = - da_i * x[ip+1] ;
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
x[ip+1] = da_i * x[ip] ; x[ip+1] = da_i * x[ip] ;
} }
} }

View File

@ -1,3 +1,5 @@
CSUMKERNEL=csum.S
ifndef SNRM2KERNEL ifndef SNRM2KERNEL
SNRM2KERNEL = ../arm/nrm2.c SNRM2KERNEL = ../arm/nrm2.c
endif endif

View File

@ -1,3 +1,6 @@
CSUMKERNEL = csum_thunderx2t99.c
ZSUMKERNEL = zsum_thunderx2t99.c
SAMINKERNEL = ../arm/amin.c SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c CAMINKERNEL = ../arm/zamin.c

View File

@ -0,0 +1,3 @@
include $(KERNELDIR)/KERNEL.CORTEXA57

View File

@ -91,8 +91,8 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c SNRM2KERNEL = nrm2.S
DNRM2KERNEL = dznrm2_thunderx2t99.c DNRM2KERNEL = nrm2.S
CNRM2KERNEL = scnrm2_thunderx2t99.c CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c ZNRM2KERNEL = dznrm2_thunderx2t99.c

View File

@ -0,0 +1,247 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/*
 * Register assignments for the inline-assembly kernel in casum_compute().
 * Each macro expands to a string literal that is spliced into the asm
 * template by string concatenation.
 */
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define J "x5" /* loop variable */
/* Zero register; moved into the accumulators with fmov to clear them. */
#define REG0 "wzr"
/* Scalar accumulator (32-bit view of v0). */
#define SUMF "s0"
/* 64-bit view of the same register, used when moving the result out. */
#define SUMFD "d0"
/******************************************************************************/
/*
 * Unit-stride step for one element: load 8 bytes (two 32-bit lanes), advance
 * X by 8, bring the upper lane down with ext, add both lanes together and
 * accumulate the result into SUMF. Uses d1 and v2 as scratch.
 */
#define KERNEL_F1 \
"ldr d1, ["X"] \n" \
"add "X", "X", #8 \n" \
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
"fadd s1, s1, s2 \n" \
"fadd "SUMF", "SUMF", s1 \n"
/*
 * Unit-stride step for 256 bytes (32 elements of 8 bytes): loads q16-q31,
 * adds neighbouring registers pairwise and accumulates the eight partial
 * vectors into v0-v7. X is advanced by 256 before the prefetches are issued,
 * so the PRFM offsets are relative to the start of the next block.
 */
#define KERNEL_F32 \
"ldr q16, ["X"] \n" \
"ldr q17, ["X", #16] \n" \
"ldr q18, ["X", #32] \n" \
"ldr q19, ["X", #48] \n" \
"ldp q20, q21, ["X", #64] \n" \
"ldp q22, q23, ["X", #96] \n" \
"ldp q24, q25, ["X", #128] \n" \
"ldp q26, q27, ["X", #160] \n" \
"fadd v16.4s, v16.4s, v17.4s \n" \
"fadd v18.4s, v18.4s, v19.4s \n" \
"ldp q28, q29, ["X", #192] \n" \
"ldp q30, q31, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fadd v20.4s, v20.4s, v21.4s \n" \
"fadd v22.4s, v22.4s, v23.4s \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"fadd v24.4s, v24.4s, v25.4s \n" \
"fadd v26.4s, v26.4s, v27.4s \n" \
"fadd v0.4s, v0.4s, v16.4s \n" \
"fadd v1.4s, v1.4s, v18.4s \n" \
"fadd v2.4s, v2.4s, v20.4s \n" \
"fadd v3.4s, v3.4s, v22.4s \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fadd v28.4s, v28.4s, v29.4s \n" \
"fadd v30.4s, v30.4s, v31.4s \n" \
"fadd v4.4s, v4.4s, v24.4s \n" \
"fadd v5.4s, v5.4s, v26.4s \n" \
"fadd v6.4s, v6.4s, v28.4s \n" \
"fadd v7.4s, v7.4s, v30.4s \n"
/*
 * Collapse the eight vector accumulators v0-v7 into the scalar SUMF:
 * tree-add them into v0, fold the upper half of v0 onto the lower half, then
 * pairwise-add the two remaining lanes.
 */
#define KERNEL_F32_FINALIZE \
"fadd v0.4s, v0.4s, v1.4s \n" \
"fadd v2.4s, v2.4s, v3.4s \n" \
"fadd v4.4s, v4.4s, v5.4s \n" \
"fadd v6.4s, v6.4s, v7.4s \n" \
"fadd v0.4s, v0.4s, v2.4s \n" \
"fadd v4.4s, v4.4s, v6.4s \n" \
"fadd v0.4s, v0.4s, v4.4s \n" \
"ext v1.16b, v0.16b, v0.16b, #8 \n" \
"fadd v0.2s, v0.2s, v1.2s \n" \
"faddp "SUMF", v0.2s \n"
/* Convert the stride from elements to bytes (shift left by 3 = times 8). */
#define INIT_S \
"lsl "INC_X", "INC_X", #3 \n"
/*
 * Strided step for one element: same arithmetic as KERNEL_F1, but X advances
 * by the byte stride held in INC_X.
 */
#define KERNEL_S1 \
"ldr d1, ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
"fadd s1, s1, s2 \n" \
"fadd "SUMF", "SUMF", s1 \n"
/* Threading driver (defined elsewhere) used by CNAME when SMP is enabled. */
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
/*
 * Sum the components of n elements of x (stride inc_x, counted in elements)
 * with the inline AArch64 kernel assembled from the KERNEL_* macros above.
 * Each element is 8 bytes wide and both of its 32-bit lanes are added to the
 * running sum. Despite the "asum" label names, no absolute value is taken.
 *
 * Returns 0 when n <= 0 or inc_x <= 0; both are rejected by the asm prologue.
 * inc_x == 1 takes the vectorised contiguous path (32 elements per iteration
 * followed by a one-element-at-a-time tail); every other positive stride
 * takes the strided path (4 elements per iteration plus a tail).
 */
static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0 ;
	if ( n < 0 ) return(asum);
	__asm__ __volatile__ (
	" mov "N", %[N_] \n"
	" mov "X", %[X_] \n"
	" mov "INC_X", %[INCX_] \n"
	/* Writing the scalar view clears the whole vector accumulator. */
	" fmov "SUMF", "REG0" \n"
	" fmov s1, "REG0" \n"
	" fmov s2, "REG0" \n"
	" fmov s3, "REG0" \n"
	" fmov s4, "REG0" \n"
	" fmov s5, "REG0" \n"
	" fmov s6, "REG0" \n"
	" fmov s7, "REG0" \n"
	" cmp "N", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", #1 \n"
	" bne 5f //asum_kernel_S_BEGIN \n"
	"1: //asum_kernel_F_BEGIN: \n"
	" asr "J", "N", #5 \n"
	" cmp "J", xzr \n"
	" beq 3f //asum_kernel_F1 \n"
	"2: //asum_kernel_F32: \n"
	" "KERNEL_F32" \n"
	" subs "J", "J", #1 \n"
	" bne 2b //asum_kernel_F32 \n"
	" "KERNEL_F32_FINALIZE" \n"
	"3: //asum_kernel_F1: \n"
	" ands "J", "N", #31 \n"
	" ble 9f //asum_kernel_L999 \n"
	"4: //asum_kernel_F10: \n"
	" "KERNEL_F1" \n"
	" subs "J", "J", #1 \n"
	" bne 4b //asum_kernel_F10 \n"
	" b 9f //asum_kernel_L999 \n"
	"5: //asum_kernel_S_BEGIN: \n"
	" "INIT_S" \n"
	" asr "J", "N", #2 \n"
	" cmp "J", xzr \n"
	" ble 7f //asum_kernel_S1 \n"
	"6: //asum_kernel_S4: \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 6b //asum_kernel_S4 \n"
	"7: //asum_kernel_S1: \n"
	" ands "J", "N", #3 \n"
	" ble 9f //asum_kernel_L999 \n"
	"8: //asum_kernel_S10: \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 8b //asum_kernel_S10 \n"
	"9: //asum_kernel_L999: \n"
	/* Move the accumulator out through a general-purpose register. */
	" fmov %[ASUM_], "SUMFD" \n"
	: [ASUM_] "=r" (asum) //%0
	: [N_] "r" (n), //%1
	[X_] "r" (x), //%2
	[INCX_] "r" (inc_x) //%3
	: "cc",
	"memory",
	"x0", "x1", "x2", "x3", "x4", "x5",
	"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	/*
	 * KERNEL_F32 loads into q16-q31. They must be listed as clobbered too,
	 * otherwise the compiler is free to keep live values in them across
	 * this asm statement.
	 */
	"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
	"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
	);
	return asum;
}
#if defined(SMP)
/*
 * Worker callback passed to blas_level1_thread_with_return_value(): sums the
 * slice described by (n, x, inc_x) and stores that partial sum through
 * `result`. The remaining parameters are not used here. Always returns 0.
 */
static int casum_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	FLOAT partial = casum_compute(n, x, inc_x);

	result[0] = partial;
	return 0;
}
#endif
/*
 * Kernel entry point: returns the sum produced by casum_compute() over n
 * elements of x with stride inc_x.
 *
 * With SMP enabled, inputs with n > 10000 and a non-zero stride are handed to
 * blas_level1_thread_with_return_value() with num_cpu_avail(1) threads. Each
 * worker writes its partial sum through the result pointer it receives, and
 * the partial sums are added up here. In every other case the computation
 * runs directly on the calling thread.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
	int nthreads;
	/*
	 * Placeholder alpha for the threading driver; the worker ignores it.
	 * It is zero-initialised and two elements wide so the driver never
	 * reads an indeterminate or out-of-bounds value.
	 * NOTE(review): the driver is believed to fetch both a real and an
	 * imaginary alpha component in BLAS_COMPLEX mode - confirm against
	 * the driver source.
	 */
	FLOAT dummy_alpha[2] = {0.0, 0.0};
#endif
	FLOAT asum = 0.0;
#if defined(SMP)
	/* Small or zero-stride inputs are not worth splitting across threads. */
	if (inc_x == 0 || n <= 10000)
		nthreads = 1;
	else
		nthreads = num_cpu_avail(1);
	if (nthreads == 1) {
		asum = casum_compute(n, x, inc_x);
	} else {
		int mode, i;
		/* One slot of sizeof(double) * 2 bytes per possible thread. */
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		FLOAT *ptr;
		mode = BLAS_SINGLE | BLAS_COMPLEX;
		blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha,
			x, inc_x, NULL, 0, result, 0,
			( void *)casum_thread_function, nthreads);
		/* Read the first FLOAT of every slot and accumulate it. */
		ptr = (FLOAT *)result;
		for (i = 0; i < nthreads; i++) {
			asum = asum + (*ptr);
			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
		}
	}
#else
	asum = casum_compute(n, x, inc_x);
#endif
	return asum;
}

View File

@ -77,7 +77,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble 9f //nrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble 9f //nrm2_kernel_L999 \n" " beq 9f //nrm2_kernel_L999 \n"
"1: //nrm2_kernel_F_BEGIN: \n" "1: //nrm2_kernel_F_BEGIN: \n"
" mov x6, #0x7FF0000000000000 //+Infinity \n" " mov x6, #0x7FF0000000000000 //+Infinity \n"
@ -345,7 +345,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#endif #endif
FLOAT ssq, scale; FLOAT ssq, scale;
if (n <= 0 || inc_x <= 0) return 0.0; if (n <= 0 || inc_x == 0) return 0.0;
#if defined(SMP) #if defined(SMP)
if (n <= 10000) if (n <= 10000)

View File

@ -229,7 +229,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
" cmp "N", xzr \n" " cmp "N", xzr \n"
" ble 9f //nrm2_kernel_L999 \n" " ble 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", xzr \n" " cmp "INC_X", xzr \n"
" ble 9f //nrm2_kernel_L999 \n" " beq 9f //nrm2_kernel_L999 \n"
" cmp "INC_X", #1 \n" " cmp "INC_X", #1 \n"
" bne 5f //nrm2_kernel_S_BEGIN \n" " bne 5f //nrm2_kernel_S_BEGIN \n"
@ -315,7 +315,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT nrm2 = 0.0; FLOAT nrm2 = 0.0;
double nrm2_double = 0.0; double nrm2_double = 0.0;
if (n <= 0 || inc_x <= 0) return 0.0; if (n <= 0 || inc_x == 0) return 0.0;
#if defined(SMP) #if defined(SMP)
if (n <= 10000) if (n <= 10000)

View File

@ -223,7 +223,7 @@ zscal_begin:
fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq .Lzscal_kernel_RI_zero beq .Lzscal_kernel_RI_zero
b .Lzscal_kernel_R_zero // b .Lzscal_kernel_R_zero
.Lzscal_kernel_R_non_zero: .Lzscal_kernel_R_non_zero:

View File

@ -0,0 +1,244 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <arm_neon.h>
/*
 * Register assignments for the inline-assembly kernel in zasum_compute().
 * Each macro expands to a string literal that is spliced into the asm
 * template by string concatenation.
 */
#define N "x0" /* vector length */
#define X "x1" /* "X" vector address */
#define INC_X "x2" /* "X" stride */
#define J "x5" /* loop variable */
/* Zero register; moved into the accumulators with fmov to clear them. */
#define REG0 "xzr"
/* Scalar accumulator (64-bit view of v0). */
#define SUMF "d0"
/* Not referenced by the macros or the kernel visible in this file. */
#define TMPF "d1"
/******************************************************************************/
/*
 * Unit-stride step for one element: load 16 bytes (two 64-bit lanes), advance
 * X by 16, add the two lanes with faddp and accumulate the result into SUMF.
 * Uses v1 as scratch.
 */
#define KERNEL_F1 \
"ldr q1, ["X"] \n" \
"add "X", "X", #16 \n" \
"faddp d1, v1.2d \n" \
"fadd "SUMF", "SUMF", d1 \n"
/*
 * Unit-stride step for 256 bytes (16 elements of 16 bytes): loads q16-q31,
 * adds neighbouring registers pairwise and accumulates the eight partial
 * vectors into v0-v7. X is advanced by 256 before the prefetches are issued,
 * so the PRFM offsets are relative to the start of the next block.
 */
#define KERNEL_F16 \
"ldr q16, ["X"] \n" \
"ldr q17, ["X", #16] \n" \
"ldr q18, ["X", #32] \n" \
"ldr q19, ["X", #48] \n" \
"ldp q20, q21, ["X", #64] \n" \
"ldp q22, q23, ["X", #96] \n" \
"ldp q24, q25, ["X", #128] \n" \
"ldp q26, q27, ["X", #160] \n" \
"fadd v16.2d, v16.2d, v17.2d \n" \
"fadd v18.2d, v18.2d, v19.2d \n" \
"ldp q28, q29, ["X", #192] \n" \
"ldp q30, q31, ["X", #224] \n" \
"add "X", "X", #256 \n" \
"fadd v20.2d, v20.2d, v21.2d \n" \
"fadd v22.2d, v22.2d, v23.2d \n" \
"PRFM PLDL1KEEP, ["X", #1024] \n" \
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
"fadd v24.2d, v24.2d, v25.2d \n" \
"fadd v26.2d, v26.2d, v27.2d \n" \
"fadd v28.2d, v28.2d, v29.2d \n" \
"fadd v30.2d, v30.2d, v31.2d \n" \
"fadd v0.2d, v0.2d, v16.2d \n" \
"fadd v1.2d, v1.2d, v18.2d \n" \
"fadd v2.2d, v2.2d, v20.2d \n" \
"fadd v3.2d, v3.2d, v22.2d \n" \
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
"fadd v4.2d, v4.2d, v24.2d \n" \
"fadd v5.2d, v5.2d, v26.2d \n" \
"fadd v6.2d, v6.2d, v28.2d \n" \
"fadd v7.2d, v7.2d, v30.2d \n"
/*
 * Collapse the eight vector accumulators v0-v7 into the scalar SUMF:
 * tree-add them into v0, then pairwise-add its two lanes.
 */
#define KERNEL_F16_FINALIZE \
"fadd v0.2d, v0.2d, v1.2d \n" \
"fadd v2.2d, v2.2d, v3.2d \n" \
"fadd v4.2d, v4.2d, v5.2d \n" \
"fadd v6.2d, v6.2d, v7.2d \n" \
"fadd v0.2d, v0.2d, v2.2d \n" \
"fadd v4.2d, v4.2d, v6.2d \n" \
"fadd v0.2d, v0.2d, v4.2d \n" \
"faddp "SUMF", v0.2d \n"
/* Convert the stride from elements to bytes (shift left by 4 = times 16). */
#define INIT_S \
"lsl "INC_X", "INC_X", #4 \n"
/*
 * Strided step for one element: same arithmetic as KERNEL_F1, but X advances
 * by the byte stride held in INC_X.
 */
#define KERNEL_S1 \
"ldr q1, ["X"] \n" \
"add "X", "X", "INC_X" \n" \
"faddp d1, v1.2d \n" \
"fadd "SUMF", "SUMF", d1 \n"
/* Threading driver (defined elsewhere) used by CNAME when SMP is enabled. */
#if defined(SMP)
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads);
#endif
/*
 * Sum the components of n elements of x (stride inc_x, counted in elements)
 * with the inline AArch64 kernel assembled from the KERNEL_* macros above.
 * Each element is 16 bytes wide and both of its 64-bit lanes are added to the
 * running sum. Despite the "asum" label names, no absolute value is taken.
 *
 * Returns 0 when n <= 0 or inc_x <= 0; both are rejected by the asm prologue.
 * inc_x == 1 takes the vectorised contiguous path (16 elements per iteration
 * followed by a one-element-at-a-time tail); every other positive stride
 * takes the strided path (4 elements per iteration plus a tail).
 */
static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0 ;
	if ( n < 0 ) return(asum);
	__asm__ __volatile__ (
	" mov "N", %[N_] \n"
	" mov "X", %[X_] \n"
	" mov "INC_X", %[INCX_] \n"
	/* Writing the scalar view clears the whole vector accumulator. */
	" fmov "SUMF", "REG0" \n"
	" fmov d1, "REG0" \n"
	" fmov d2, "REG0" \n"
	" fmov d3, "REG0" \n"
	" fmov d4, "REG0" \n"
	" fmov d5, "REG0" \n"
	" fmov d6, "REG0" \n"
	" fmov d7, "REG0" \n"
	" cmp "N", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", #1 \n"
	" bne 5f //asum_kernel_S_BEGIN \n"
	"1: //asum_kernel_F_BEGIN: \n"
	" asr "J", "N", #4 \n"
	" cmp "J", xzr \n"
	" beq 3f //asum_kernel_F1 \n"
	".align 5 \n"
	"2: //asum_kernel_F16: \n"
	" "KERNEL_F16" \n"
	" subs "J", "J", #1 \n"
	" bne 2b //asum_kernel_F16 \n"
	" "KERNEL_F16_FINALIZE" \n"
	"3: //asum_kernel_F1: \n"
	" ands "J", "N", #15 \n"
	" ble 9f //asum_kernel_L999 \n"
	"4: //asum_kernel_F10: \n"
	" "KERNEL_F1" \n"
	" subs "J", "J", #1 \n"
	" bne 4b //asum_kernel_F10 \n"
	" b 9f //asum_kernel_L999 \n"
	"5: //asum_kernel_S_BEGIN: \n"
	" "INIT_S" \n"
	" asr "J", "N", #2 \n"
	" cmp "J", xzr \n"
	" ble 7f //asum_kernel_S1 \n"
	"6: //asum_kernel_S4: \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 6b //asum_kernel_S4 \n"
	"7: //asum_kernel_S1: \n"
	" ands "J", "N", #3 \n"
	" ble 9f //asum_kernel_L999 \n"
	"8: //asum_kernel_S10: \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 8b //asum_kernel_S10 \n"
	"9: //asum_kernel_L999: \n"
	/* Move the accumulator out through a general-purpose register. */
	" fmov %[ASUM_], "SUMF" \n"
	: [ASUM_] "=r" (asum) //%0
	: [N_] "r" (n), //%1
	[X_] "r" (x), //%2
	[INCX_] "r" (inc_x) //%3
	: "cc",
	"memory",
	"x0", "x1", "x2", "x3", "x4", "x5",
	"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	/*
	 * KERNEL_F16 loads into q16-q31. They must be listed as clobbered too,
	 * otherwise the compiler is free to keep live values in them across
	 * this asm statement.
	 */
	"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
	"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
	);
	return asum;
}
#if defined(SMP)
/*
 * Worker callback passed to blas_level1_thread_with_return_value(): sums the
 * slice described by (n, x, inc_x) and stores that partial sum through
 * `result`. The remaining parameters are not used here. Always returns 0.
 */
static int zasum_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	FLOAT partial = zasum_compute(n, x, inc_x);

	result[0] = partial;
	return 0;
}
#endif
/*
 * Kernel entry point: returns the sum produced by zasum_compute() over n
 * elements of x with stride inc_x.
 *
 * With SMP enabled, inputs with n > 10000 and a non-zero stride are handed to
 * blas_level1_thread_with_return_value() with num_cpu_avail(1) threads. Each
 * worker writes its partial sum through the result pointer it receives, and
 * the partial sums are added up here. In every other case the computation
 * runs directly on the calling thread.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
	int nthreads;
	/*
	 * Placeholder alpha for the threading driver; the worker ignores it.
	 * It is zero-initialised and two elements wide so the driver never
	 * reads an indeterminate or out-of-bounds value.
	 * NOTE(review): the driver is believed to fetch both a real and an
	 * imaginary alpha component in BLAS_COMPLEX mode - confirm against
	 * the driver source.
	 */
	FLOAT dummy_alpha[2] = {0.0, 0.0};
#endif
	FLOAT asum = 0.0;
#if defined(SMP)
	/* Small or zero-stride inputs are not worth splitting across threads. */
	if (inc_x == 0 || n <= 10000)
		nthreads = 1;
	else
		nthreads = num_cpu_avail(1);
	if (nthreads == 1) {
		asum = zasum_compute(n, x, inc_x);
	} else {
		int mode, i;
		/* One slot of sizeof(double) * 2 bytes per possible thread. */
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		FLOAT *ptr;
		mode = BLAS_DOUBLE | BLAS_COMPLEX;
		blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha,
			x, inc_x, NULL, 0, result, 0,
			( void *)zasum_thread_function, nthreads);
		/* Read the first FLOAT of every slot and accumulate it. */
		ptr = (FLOAT *)result;
		for (i = 0; i < nthreads; i++) {
			asum = asum + (*ptr);
			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
		}
	}
#else
	asum = zasum_compute(n, x, inc_x);
#endif
	return asum;
}

149
kernel/csky/KERNEL Normal file
View File

@ -0,0 +1,149 @@
# Kernel source selection for the csky kernel directory. Every routine is
# mapped to a C implementation taken from ../arm or ../generic; no
# target-specific sources are referenced. The S/D/C/Z prefixes follow the usual
# BLAS naming for single, double, complex and double-complex precision.
# --- amax / amin ---
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
# --- max / min (real precisions only) ---
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
# --- index-returning variants: iamax / iamin / imax / imin ---
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
# --- asum / sum ---
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
# --- axpy ---
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
# --- copy ---
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
# --- dot (DSDOT uses the generic implementation) ---
SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
DSDOTKERNEL = ../generic/dot.c
# --- nrm2 ---
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
# --- rot ---
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
# --- scal ---
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
# --- swap ---
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
# --- gemv (N and T variants) ---
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
# --- trmm: generic 2x2 kernels ---
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# --- gemm: generic 2x2 kernels, their copy routines and copy object names ---
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- trsm: all four precisions point at the same generic source files ---
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# --- cabs and lsame helpers ---
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
LSAME_KERNEL = ../generic/lsame.c
# --- gemm beta ---
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c

1
kernel/csky/Makefile Normal file
View File

@ -0,0 +1 @@
# Double-colon clean rule with no recipe of its own in this file.
clean ::

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,
if ( rows <= 0 ) return(0); if ( rows <= 0 ) return(0);
if ( cols <= 0 ) return(0); if ( cols <= 0 ) return(0);
if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0);
aptr = a; aptr = a;
lda *= 2; lda *= 2;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,587 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Packs the source panel `a` into `b`, negating every value on the way.
 *
 * lda is doubled on entry; after that, consecutive source rows are lda FLOATs
 * apart (presumably because each entry is an interleaved pair of FLOATs -
 * confirm against the callers). n counts such two-FLOAT entries per row and
 * m counts rows.
 *
 * The columns are processed in strips of 32 FLOATs (repeated n >> 4 times),
 * then at most one strip each of 16, 8, 4 and 2 FLOATs, selected by the bits
 * n & 8, n & 4, n & 2 and n & 1. Within a strip the rows are emitted one
 * after another into `b`, two rows per pass plus a final single row when m
 * is odd. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  /* One pass handles at most two rows of 32 FLOATs each. */
  FLOAT staged[64];
  FLOAT *src_strip, *row_lo, *row_hi, *dst;
  BLASLONG width, strips, pairs, k;

  src_strip = a;
  dst       = b;
  lda *= 2;

#if 0
  fprintf(stderr, "M = %d N = %d\n", m, n);
#endif

  for (width = 32; width >= 2; width >>= 1){

    /* The widest strip repeats; each narrower one occurs at most once. */
    if (width == 32) strips = (n >> 4);
    else             strips = (n & (width >> 1)) ? 1 : 0;

    for (; strips > 0; strips--){
      row_lo = src_strip;
      row_hi = src_strip + lda;
      src_strip += width;

      /* Two rows per pass: read both completely, then write them negated. */
      for (pairs = (m >> 1); pairs > 0; pairs--){
        for (k = 0; k < width; k++){
          staged[k]         = row_lo[k];
          staged[width + k] = row_hi[k];
        }
        for (k = 0; k < 2 * width; k++) dst[k] = -staged[k];

        row_lo += 2 * lda;
        row_hi += 2 * lda;
        dst    += 2 * width;
      }

      /* Left-over row when m is odd. */
      if (m & 1){
        for (k = 0; k < width; k++) staged[k] = row_lo[k];
        for (k = 0; k < width; k++) dst[k] = -staged[k];
        dst += width;
      }
    }
  }

  return 0;
}

View File

@ -0,0 +1,333 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack one panel of `width` adjacent columns, `rows` entries deep, into `dst`.
 *
 * Every entry is a pair of consecutive FLOATs (presumably the real and
 * imaginary parts of a complex number -- confirm against the callers).
 * `stride` is the leading dimension already expressed in FLOATs.
 *
 * For the entry at panel column `col` and panel row `row`, let
 *     p = posX + col,   q = posY + row.
 * The pair is read from  a + 2 * p + q * stride  while p > q, and from
 * a + 2 * q + p * stride  otherwise; in other words the larger of the two
 * indices always takes the unit-stride position, so only one triangle of
 * `a` (diagonal included) is ever touched.
 *
 * `width` must not exceed 16 (the size of the local scratch arrays).
 * Returns the position in `dst` immediately after the last FLOAT written.
 */
static FLOAT *symm_pack_columns(BLASLONG rows, BLASLONG width, FLOAT *a, BLASLONG stride,
                                BLASLONG posX, BLASLONG posY, FLOAT *dst){

  FLOAT   *src[16];     /* current read position of each panel column      */
  FLOAT    staged[32];  /* one output row, gathered before it is stored    */
  BLASLONG diag = posX - posY;
  BLASLONG col, row;

  /* Choose the starting address of every column from its position
     relative to the diagonal. */
  for (col = 0; col < width; col++) {
    if (diag > -col) {
      src[col] = a + (posX + col) * 2 + posY * stride;
    } else {
      src[col] = a + posY * 2 + (posX + col) * stride;
    }
  }

  for (row = rows; row > 0; row--) {

    /* Load the whole row first, then step each column pointer: by the
       leading dimension while the column is still on the positive side
       of the diagonal, by one pair afterwards. */
    for (col = 0; col < width; col++) {
      staged[2 * col + 0] = *(src[col] + 0);
      staged[2 * col + 1] = *(src[col] + 1);

      if (diag > -col) {
        src[col] += stride;
      } else {
        src[col] += 2;
      }
    }

    /* Store only after all loads of this row are done. */
    for (col = 0; col < 2 * width; col++) {
      dst[col] = staged[col];
    }

    dst += 2 * width;
    diag --;
  }

  return dst;
}

/*
 * Copy an m-by-n block of FLOAT pairs out of `a` into the contiguous buffer
 * `b`, mirroring accesses across the diagonal as described for
 * symm_pack_columns() above.
 *
 *   m          number of rows emitted for every column panel
 *   n          number of columns; handled as n / 16 panels of 16 columns
 *              followed by optional panels of 8, 4, 2 and 1 columns chosen
 *              from the low bits of n
 *   a          source data
 *   lda        leading dimension of `a` counted in pairs (doubled on entry
 *              to obtain a FLOAT stride)
 *   posX, posY coordinates of the first entry of the block; posX advances
 *              by the panel width after each panel
 *   b          destination; each panel of width W appends m rows of 2 * W
 *              FLOATs
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG panels;

  lda *= 2;

  /* Full 16-column panels. */
  for (panels = (n >> 4); panels > 0; panels--) {
    b = symm_pack_columns(m, 16, a, lda, posX, posY, b);
    posX += 16;
  }

  /* Remaining columns, from the widest leftover panel to the narrowest. */
  if (n & 8) {
    b = symm_pack_columns(m, 8, a, lda, posX, posY, b);
    posX += 8;
  }

  if (n & 4) {
    b = symm_pack_columns(m, 4, a, lda, posX, posY, b);
    posX += 4;
  }

  if (n & 2) {
    b = symm_pack_columns(m, 2, a, lda, posX, posY, b);
    posX += 2;
  }

  if (n & 1) {
    symm_pack_columns(m, 1, a, lda, posX, posY, b);
  }

  return 0;
}

Some files were not shown because too many files have changed in this diff Show More