Merge pull request #4606 from OpenMathLib/develop
Merge develop branch for 0.3.27
This commit is contained in:
commit
8f3bb62254
86
.cirrus.yml
86
.cirrus.yml
|
@ -1,44 +1,44 @@
|
|||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||
#task:
|
||||
# name: AppleM1/LLVM
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM/ILP64
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||
#task:
|
||||
# name: AppleM1/LLVM/ILP64
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM/CMAKE
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- mkdir build
|
||||
- cd build
|
||||
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||
- make -j 4
|
||||
#task:
|
||||
# name: AppleM1/LLVM/CMAKE
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - mkdir build
|
||||
# - cd build
|
||||
# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||
# - make -j 4
|
||||
|
||||
task:
|
||||
name: AppleM1/GCC/MAKE/OPENMP
|
||||
compile_script:
|
||||
- brew install gcc@11
|
||||
- export PATH=/opt/homebrew/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/include"
|
||||
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||
#task:
|
||||
# name: AppleM1/GCC/MAKE/OPENMP
|
||||
# compile_script:
|
||||
# - brew install gcc@11
|
||||
# - export PATH=/opt/homebrew/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/include"
|
||||
# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
|
@ -58,8 +58,8 @@ task:
|
|||
- export VALID_ARCHS="i386 x86_64"
|
||||
- xcrun --sdk macosx --show-sdk-path
|
||||
- xcodebuild -version
|
||||
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
|
||||
- export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX14.4.sdk -arch x86_64"
|
||||
- make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
|
@ -78,8 +78,8 @@ task:
|
|||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- export CC=/Applications/Xcode-15.3.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-15.3.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS17.4.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
|
||||
always:
|
||||
config_artifacts:
|
||||
|
@ -91,14 +91,16 @@ macos_instance:
|
|||
task:
|
||||
name: AppleM1/LLVM armv7-androidndk xbuild
|
||||
compile_script:
|
||||
- #brew install android-ndk
|
||||
- brew install android-ndk
|
||||
- export #PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
|
||||
- ls /System/Volumes/Data/opt/homebrew
|
||||
- ls -l /System/Volumes/Data/opt/homebrew/Caskroom/
|
||||
- find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib"
|
||||
- #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
- #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26c/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
|
||||
- make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
|
||||
always:
|
||||
config_artifacts:
|
||||
|
|
|
@ -0,0 +1,149 @@
|
|||
name: apple m
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: macos-14
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [cmake, make]
|
||||
fortran: [gfortran]
|
||||
openmp: [0, 1]
|
||||
ilp64: [0, 1]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Print system information
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
cat /proc/cpuinfo
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
sysctl -a | grep machdep.cpu
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||
brew reinstall gcc
|
||||
brew install coreutils cmake ccache
|
||||
brew install llvm
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
# GNU make and cmake call the compilers differently. It looks like
|
||||
# that causes the cache to mismatch. Keep the ccache for both build
|
||||
# tools separate to avoid polluting each other.
|
||||
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
if [ "${{ matrix.build }}" = "make" ]; then
|
||||
# Add ccache to path
|
||||
if [ "$RUNNER_OS" = "Linux" ]; then
|
||||
echo "/usr/lib/ccache" >> $GITHUB_PATH
|
||||
elif [ "$RUNNER_OS" = "macOS" ]; then
|
||||
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
|
||||
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
|
||||
echo "" >>$GITHUB_PATH
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: |
|
||||
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
export CC="/opt/homebrew/opt/llvm/bin/clang"
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
|
||||
;;
|
||||
"cmake")
|
||||
export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
|
||||
mkdir build && cd build
|
||||
cmake -DDYNAMIC_ARCH=1 \
|
||||
-DUSE_OPENMP=${{matrix.openmp}} \
|
||||
-DINTERFACE64=${{matrix.ilp64}} \
|
||||
-DNOFORTRAN=0 \
|
||||
-DBUILD_WITHOUT_LAPACK=0 \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
cmake --build .
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- name: Show ccache status
|
||||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
|
||||
echo "::group::Tests in 'test' directory"
|
||||
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'ctest' directory"
|
||||
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'utest' directory"
|
||||
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
;;
|
||||
"cmake")
|
||||
cd build && ctest
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
|
@ -14,8 +14,8 @@ jobs:
|
|||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
|
||||
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
|
||||
xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
|
||||
toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
|
@ -76,7 +76,7 @@ jobs:
|
|||
run: |
|
||||
wget ${xuetie_toolchain}/${toolchain_file_name}
|
||||
tar -xvf ${toolchain_file_name} -C /opt
|
||||
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
|
||||
export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
|
||||
|
||||
make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
|
|
|
@ -42,6 +42,7 @@ jobs:
|
|||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||
|
|
|
@ -0,0 +1,253 @@
|
|||
name: riscv64 zvl256b qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
triple: riscv64-unknown-linux-gnu
|
||||
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
|
||||
riscv_gnu_toolchain_version: 13.2.0
|
||||
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: RISCV64_ZVL128B
|
||||
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
|
||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
|
||||
- target: RISCV64_ZVL256B
|
||||
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
|
||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install build deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
# -y keeps the install non-interactive, matching the other apt-get install steps.
sudo apt-get install -y autoconf automake autotools-dev ninja-build make \
|
||||
libgomp1-riscv64-cross ccache
|
||||
wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
|
||||
tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: build OpenBLAS libs
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
|
||||
|
||||
- name: build OpenBLAS tests
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='${triple}-gcc' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
|
||||
|
||||
- name: build lapack-netlib tests
|
||||
working-directory: ./lapack-netlib/TESTING
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='${triple}-gcc' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
|
||||
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
|
||||
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
|
||||
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz
|
||||
|
||||
- name: OpenBLAS tests
|
||||
shell: bash
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||
rm -rf ./test_out
|
||||
mkdir -p ./test_out
|
||||
run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \
|
||||
echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \
|
||||
if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee -a $OUTPUT ; \
|
||||
else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee -a $OUTPUT ; fi ; \
|
||||
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \
|
||||
}
|
||||
run_test test cblat1 &
|
||||
run_test test cblat2 cblat2.dat &
|
||||
run_test test cblat3 cblat3.dat &
|
||||
run_test test dblat1 &
|
||||
run_test test dblat2 dblat2.dat &
|
||||
run_test test dblat3 dblat3.dat &
|
||||
run_test test sblat1 &
|
||||
run_test test sblat2 sblat2.dat &
|
||||
run_test test sblat3 sblat3.dat &
|
||||
run_test test zblat1 &
|
||||
run_test test zblat2 zblat2.dat &
|
||||
run_test test zblat3 zblat3.dat &
|
||||
run_test ctest xccblat1 &
|
||||
run_test ctest xccblat2 cin2 &
|
||||
run_test ctest xccblat3 cin3 &
|
||||
run_test ctest xdcblat1 &
|
||||
run_test ctest xdcblat2 din2 &
|
||||
run_test ctest xdcblat3 din3 &
|
||||
run_test ctest xscblat1 &
|
||||
run_test ctest xscblat2 sin2 &
|
||||
run_test ctest xscblat3 sin3 &
|
||||
run_test ctest xzcblat1 &
|
||||
run_test ctest xzcblat2 zin2 &
|
||||
run_test ctest xzcblat3 zin3 &
|
||||
wait
|
||||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
||||
|
||||
- name: netlib tests
|
||||
shell: bash
|
||||
run: |
|
||||
: # these take a very long time
|
||||
echo "Skipping netlib tests in CI"
|
||||
exit 0
|
||||
: # comment out exit above to enable the tests
|
||||
: # probably we want to identify a subset to run in CI
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||
rm -rf ./test_out
|
||||
mkdir -p ./test_out
|
||||
run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \
|
||||
echo "$4" >> $OUTPUT; \
|
||||
echo "$CMD" >> $OUTPUT; \
|
||||
qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \
|
||||
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \
|
||||
if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \
|
||||
if grep rror $OUTPUT | grep -v "passed" | grep -v "largest error" > /dev/null ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \
|
||||
}
|
||||
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" &
|
||||
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" &
|
||||
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" &
|
||||
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" &
|
||||
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" &
|
||||
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" &
|
||||
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" &
|
||||
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" &
|
||||
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
|
||||
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" &
|
||||
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" &
|
||||
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" &
|
||||
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" &
|
||||
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" &
|
||||
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" &
|
||||
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" &
|
||||
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" &
|
||||
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" &
|
||||
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" &
|
||||
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" &
|
||||
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" &
|
||||
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" &
|
||||
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" &
|
||||
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" &
|
||||
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" &
|
||||
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" &
|
||||
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" &
|
||||
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" &
|
||||
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" &
|
||||
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" &
|
||||
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" &
|
||||
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" &
|
||||
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" &
|
||||
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" &
|
||||
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" &
|
||||
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" &
|
||||
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" &
|
||||
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" &
|
||||
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
wait
|
||||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
|
||||
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
|
||||
NUMERICAL_ERRORS=-1
|
||||
OTHER_ERRORS=-1
|
||||
# Source the totals from the 'ALL PRECISIONS' summary row (fields 5 and 7), overriding the -1 defaults set above.
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)
|
||||
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
|
||||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
|
@ -47,46 +47,59 @@ config_last.h
|
|||
getarch
|
||||
getarch_2nd
|
||||
utest/openblas_utest
|
||||
utest/openblas_utest_ext
|
||||
ctest/xccblat1
|
||||
ctest/xccblat2
|
||||
ctest/xccblat3
|
||||
ctest/xccblat3_3m
|
||||
ctest/xdcblat1
|
||||
ctest/xdcblat2
|
||||
ctest/xdcblat3
|
||||
ctest/xdcblat3_3m
|
||||
ctest/xscblat1
|
||||
ctest/xscblat2
|
||||
ctest/xscblat3
|
||||
ctest/xscblat3_3m
|
||||
ctest/xzcblat1
|
||||
ctest/xzcblat2
|
||||
ctest/xzcblat3
|
||||
ctest/xzcblat3_3m
|
||||
exports/linktest.c
|
||||
exports/linux.def
|
||||
kernel/setparam_*.c
|
||||
kernel/kernel_*.h
|
||||
test/CBLAT2.SUMM
|
||||
test/CBLAT3.SUMM
|
||||
test/CBLAT3_3M.SUMM
|
||||
test/DBLAT2.SUMM
|
||||
test/DBLAT3.SUMM
|
||||
test/DBLAT3_3M.SUMM
|
||||
test/SBLAT2.SUMM
|
||||
test/SBLAT3.SUMM
|
||||
test/SBLAT3_3M.SUMM
|
||||
test/ZBLAT2.SUMM
|
||||
test/ZBLAT3.SUMM
|
||||
test/ZBLAT3_3M.SUMM
|
||||
test/SHBLAT3.SUMM
|
||||
test/SBBLAT3.SUMM
|
||||
test/cblat1
|
||||
test/cblat2
|
||||
test/cblat3
|
||||
test/cblat3_3m
|
||||
test/dblat1
|
||||
test/dblat2
|
||||
test/dblat3
|
||||
test/dblat3_3m
|
||||
test/sblat1
|
||||
test/sblat2
|
||||
test/sblat3
|
||||
test/sblat3_3m
|
||||
test/test_shgemm
|
||||
test/test_sbgemm
|
||||
test/zblat1
|
||||
test/zblat2
|
||||
test/zblat3
|
||||
test/zblat3_3m
|
||||
build
|
||||
build.*
|
||||
*.swp
|
||||
|
|
|
@ -24,6 +24,8 @@ option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, d
|
|||
|
||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||
|
||||
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
|
||||
|
||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
|
@ -40,6 +42,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni
|
|||
|
||||
option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
|
||||
|
||||
option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF)
|
||||
|
||||
set(LIBNAMEPREFIX "" CACHE STRING "Add a prefix to the openblas part of the library name" )
|
||||
set(LIBNAMESUFFIX "" CACHE STRING "Add a suffix after the openblas part of the library name" )
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
else()
|
||||
|
@ -96,7 +103,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
|
|||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||
|
||||
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
|
||||
set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
|
||||
|
||||
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
||||
|
||||
|
@ -323,7 +330,7 @@ if (NOT NOFORTRAN)
|
|||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
endif()
|
||||
if (BUILD_TESTING)
|
||||
if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
endif()
|
||||
endif()
|
||||
|
@ -336,11 +343,12 @@ endif()
|
|||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
|
||||
if (NOT FIXED_LIBNAME)
|
||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
||||
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
|
||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
endif()
|
||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||
if (NOT MSVC)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
|
||||
|
@ -452,6 +460,61 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
|||
endif()
|
||||
endif()
|
||||
|
||||
# Optional benchmark programs (enabled with BUILD_BENCHMARKS): every remaining
# benchmark/*.c source is compiled into one executable per precision variant.
if (BUILD_BENCHMARKS)
|
||||
#find_package(OpenMP REQUIRED)
|
||||
file(GLOB SOURCES "benchmark/*.c")
|
||||
# smallscaling.c is only built when USE_OPENMP is set.
if (NOT USE_OPENMP)
|
||||
file(GLOB REMFILE "benchmark/smallscaling.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
endif()
|
||||
# These benchmarks are dropped from the list when the library is built without LAPACK.
if (BUILD_WITHOUT_LAPACK)
|
||||
file(GLOB REMFILE "benchmark/cholesky.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/geev.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/gesv.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/getri.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/potrf.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/spmv.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/symv.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
file(GLOB REMFILE "benchmark/linpack.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
endif()
|
||||
# gemm3m.c is only built when USE_GEMM3M is set.
if (NOT USE_GEMM3M)
|
||||
file(GLOB REMFILE "benchmark/gemm3m.c")
|
||||
list(REMOVE_ITEM SOURCES ${REMFILE})
|
||||
endif()
|
||||
foreach(source ${SOURCES})
|
||||
get_filename_component(name ${source} NAME_WE)
|
||||
# zdot-intel and cula_wrapper never get a benchmark target.
if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
|
||||
# Four variants per source: no extra define (DEFAULT), COMPLEX, DOUBLE, and
# COMPLEX plus DOUBLE (the escaped ';' keeps that pair as a single list element).
set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
|
||||
foreach(define ${defines})
|
||||
set(target_name "benchmark_${name}")
|
||||
# Non-default variants get the define name(s) appended, joined with '_'.
if (NOT "${define}" STREQUAL "DEFAULT")
|
||||
string(JOIN "_" define_str ${define})
|
||||
set(target_name "${target_name}_${define_str}")
|
||||
endif()
|
||||
# imax, imin, max and min get no COMPLEX or COMPLEX_DOUBLE targets.
if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
|
||||
(NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
|
||||
(NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
|
||||
(NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
|
||||
add_executable(${target_name} ${source})
|
||||
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
|
||||
# target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
|
||||
# Pass the variant's define(s) to the compiler; DEFAULT adds none.
if (NOT "${define}" STREQUAL "DEFAULT")
|
||||
target_compile_definitions(${target_name} PRIVATE ${define})
|
||||
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
|
||||
# Install project
|
||||
|
|
|
@ -218,4 +218,8 @@ In chronological order:
|
|||
* [2022-08] Fix building from sources for QNX
|
||||
|
||||
* Mark Seminatore <https://github.com/mseminatore>
|
||||
* [2023-11-09] Improve Windows threading performance scaling
|
||||
* [2023-11-09] Improve Windows threading performance scaling
|
||||
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
|
||||
|
||||
* Dirreke <https://github.com/Dirreke>
|
||||
* [2024-01-16] Add basic support for the CSKY architecture
|
||||
|
|
100
Changelog.txt
100
Changelog.txt
|
@ -1,4 +1,104 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.27
|
||||
4-Apr-2024
|
||||
|
||||
general:
|
||||
- added initial (generic) support for the CSKY architecture
|
||||
- capped the maximum number of threads used in GEMM, GETRF and POTRF to avoid creating
|
||||
underutilized or idle threads
|
||||
- sped up multithreaded POTRF on all platforms
|
||||
- added extension openblas_set_num_threads_local() that returns the previous thread count
|
||||
- re-evaluated the SGEMV and DGEMV load thresholds to avoid activating multithreading
|
||||
for too small workloads
|
||||
- improved the fallback code used when the precompiled number of threads is exceeded,
|
||||
and made it callable multiple times during the lifetime of an instance
|
||||
- added CBLAS interfaces for the BLAS extensions ?AMIN, ?AMAX, CAXPYC and ZAXPYC
|
||||
- fixed a potential buffer overflow in the interface to the GEMMT kernels
|
||||
- fixed use of incompatible pointer types in GEMMT and C/ZAXPBY as flagged by GCC-14
|
||||
- fixed unwanted case sensitivity of the character parameters in ?TRTRS
|
||||
- sped up the OpenMP thread management code
|
||||
- fixed sizing of logical variables in INTERFACE64 builds of the C version of LAPACK
|
||||
- fixed inclusion of new LAPACK and LAPACKE functions from LAPACK 3.11 in the shared library
|
||||
- added a testsuite for the BLAS extensions
|
||||
- modified the error thresholds for SGS/DGS functions in the LAPACK testsuite to suppress
|
||||
spurious errors
|
||||
- added support for building the benchmark collection with CMAKE
|
||||
- added rewriting of linker options to avoid linking both libgomp and libomp in CMAKE builds
|
||||
with OpenMP enabled that use clang with gfortran
|
||||
- fixed building on systems with uClibc
|
||||
- added support for calling ?NRM2 with a negative increment value on all architectures
|
||||
- added support for the LLVM18 version of the flang-new compiler
|
||||
- fixed handling of the OPENBLAS_LOOPS variable in several benchmarks
|
||||
- Integrated fixes from the Reference-LAPACK project:
|
||||
- Increased accuracy in C/ZLARFGP (Reference-LAPACK PR 981)
|
||||
|
||||
x86:
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed GEMM3M functions failing in CMAKE builds
|
||||
|
||||
x86-64:
|
||||
- removed all instances of sched_yield() on Linux and BSD
|
||||
- fixed a potential deadlock in the thread server on MSWindows (introduced in 0.3.26)
|
||||
- fixed GEMM3M functions failing in CMAKE builds
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- added compiler checks for AVX512BF16 compatibility
|
||||
- fixed LLVM compiler options for Sapphire Rapids
|
||||
- fixed cpu handling fallbacks for Sapphire Rapids with
|
||||
disabled AVX2 in DYNAMIC_ARCH mode
|
||||
- fixed extensions SCSUM and DZSUM
|
||||
- improved GEMM performance for ZEN targets
|
||||
|
||||
arm:
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
|
||||
arm64:
|
||||
- added initial support for the Cortex-A76 cpu
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed default compiler options for gcc (-march and -mtune)
|
||||
- added support for ArmCompilerForLinux
|
||||
- added support for the NeoverseV2 cpu in DYNAMIC_ARCH builds
|
||||
- fixed mishandling of the INTERFACE64 option in CMAKE builds
|
||||
- corrected SCSUM kernels (erroneously duplicating SCASUM behaviour)
|
||||
- added SVE-enabled kernels for CSUM/ZSUM
|
||||
- worked around an inaccuracy in the NRM2 kernels for NeoverseN1 and Apple M
|
||||
|
||||
power:
|
||||
- improved performance of SGEMM on POWER8/9/10
|
||||
- improved performance of DGEMM on POWER10
|
||||
- added support for OpenMP builds with xlc/xlf on AIX
|
||||
- improved cpu autodetection for DYNAMIC_ARCH builds on older AIX
|
||||
- fixed cpu core counting on AIX
|
||||
- added support for building a shared library on AIX
|
||||
|
||||
riscv64:
|
||||
- added support for the X280 cpu
|
||||
- added support for semi-generic RISCV models with vector length 128 or 256
|
||||
- added support for compiling with either RVV 0.7.1 or RVV 1.0 standard compilers
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- improved cpu model autodetection
|
||||
- fixed corner cases in ?AXPBY for C910V
|
||||
- fixed handling of zero increments in ?AXPY kernels for C910V
|
||||
|
||||
loongarch64:
|
||||
- added optimized kernels for ?AMIN and ?AMAX
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed handling of corner cases in ?AXPBY
|
||||
- fixed computation of SAMIN and DAMIN in LSX mode
|
||||
- fixed computation of ?ROT
|
||||
- added optimized SSYMV and DSYMV kernels for LSX and LASX mode
|
||||
- added optimized CGEMM and ZGEMM kernels for LSX and LASX mode
|
||||
- added optimized CGEMV and ZGEMV kernels
|
||||
|
||||
mips:
|
||||
- fixed utilizing MSA on P5600 and related cpus (broken in 0.3.22)
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed mishandling of the INTERFACE64 option in CMAKE builds
|
||||
|
||||
zarch:
|
||||
- fixed handling of NaN and Inf arguments in ZSCAL
|
||||
- fixed calculation of ?SUM on Z13
|
||||
|
||||
====================================================================
|
||||
Version 0.3.26
|
||||
2-Jan-2024
|
||||
|
|
32
Makefile
32
Makefile
|
@ -1,5 +1,9 @@
|
|||
TOPDIR = .
|
||||
include ./Makefile.system
|
||||
# LNCMD is the command used below to create the library symlinks.
# With FIXED_LIBNAME=1 it is replaced by the no-op "true", so no links are made.
LNCMD = ln -fs
|
||||
ifeq ($(FIXED_LIBNAME), 1)
|
||||
LNCMD = true
|
||||
endif
|
||||
|
||||
BLASDIRS = interface driver/level2 driver/level3 driver/others
|
||||
|
||||
|
@ -134,17 +138,17 @@ shared : libs netlib $(RELA)
|
|||
ifneq ($(NO_SHARED), 1)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
|
@ -152,6 +156,9 @@ endif
|
|||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
@$(MAKE) -C exports dll
|
||||
endif
|
||||
ifeq ($(OSNAME), AIX)
|
||||
@$(MAKE) -C exports so
|
||||
endif
|
||||
endif
|
||||
|
||||
tests : shared
|
||||
|
@ -229,13 +236,13 @@ ifeq ($(INTERFACE64),1)
|
|||
endif
|
||||
@echo THELIBNAME=$(LIBNAME) >> Makefile.conf_last
|
||||
@echo THELIBSONAME=$(LIBSONAME) >> Makefile.conf_last
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@touch lib.grd
|
||||
|
||||
prof : prof_blas prof_lapack
|
||||
|
||||
prof_blas :
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d prof || exit 1 ; \
|
||||
|
@ -246,7 +253,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
|||
endif
|
||||
|
||||
blas :
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d libs || exit 1 ; \
|
||||
|
@ -254,7 +261,7 @@ blas :
|
|||
done
|
||||
|
||||
hpl :
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
for d in $(BLASDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
@ -268,7 +275,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
|||
endif
|
||||
|
||||
hpl_p :
|
||||
ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
|
||||
for d in $(SUBDIRS) ../laswp exports ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
@ -309,8 +316,12 @@ endif
|
|||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1)
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGIBM1)
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
else
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
endif
|
||||
endif
|
||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
@ -401,6 +412,7 @@ lapack-runtest: lapack-test
|
|||
|
||||
blas-test:
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
||||
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
||||
|
||||
|
|
|
@ -58,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA76)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a76
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FT2000)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
@ -104,19 +111,25 @@ ifneq ($(F_COMPILER), NAG)
|
|||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifneq ($(CROSS), 1)
|
||||
CCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||
FCOMMON_OPT += -march=armv8.4-a
|
||||
ifneq ($(CROSS), 1)
|
||||
FCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
|
@ -132,25 +145,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
|
|||
ifneq ($(OSNAME), Darwin)
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve
|
||||
ifneq ($(CROSS), 1)
|
||||
CCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
FCOMMON_OPT += -march=armv8.5-a
|
||||
ifneq ($(CROSS), 1)
|
||||
FCOMMON_OPT += -mtune=native
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
|
@ -258,9 +277,17 @@ endif
|
|||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), CORTEXX1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
||||
CCOMMON_OPT += -march=armv8.2-a
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||
CCOMMON_OPT += -mtune=cortex-x1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-x1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
@ -271,6 +298,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
|
|||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||
endif
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||
CCOMMON_OPT += -mtune=cortex-x2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mtune=cortex-x2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -290,6 +323,12 @@ CCOMMON_OPT += -march=armv8.4-a+sve
|
|||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||
endif
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ12) $(ISCLANG)))
|
||||
CCOMMON_OPT += -mtune=cortex-a710
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mtune=cortex-a710
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
# Compiler options for the CK860FV core; the Fortran options additionally pass -static.
ifeq ($(CORE), CK860FV)
|
||||
CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||
FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
|
||||
endif
|
|
@ -2,11 +2,15 @@ TOPDIR = .
|
|||
export GOTOBLAS_MAKEFILE = 1
|
||||
-include $(TOPDIR)/Makefile.conf_last
|
||||
include ./Makefile.system
|
||||
LNCMD = ln -fs
|
||||
|
||||
ifdef THELIBNAME
|
||||
LIBNAME=$(THELIBNAME)
|
||||
LIBSONAME=$(THELIBSONAME)
|
||||
endif
|
||||
ifeq ($(FIXED_LIBNAME), 1)
|
||||
LNCMD = true
|
||||
endif
|
||||
ifeq ($(INTERFACE64),1)
|
||||
USE_64BITINT=1
|
||||
endif
|
||||
|
@ -99,7 +103,7 @@ ifneq ($(NO_STATIC),1)
|
|||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifneq ($(NO_SHARED),1)
|
||||
|
@ -107,21 +111,21 @@ ifneq ($(NO_SHARED),1)
|
|||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
|
@ -149,15 +153,15 @@ ifneq ($(NO_STATIC),1)
|
|||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
#for install shared library
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
|
||||
endif
|
||||
|
@ -170,6 +174,8 @@ endif
|
|||
|
||||
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
|
||||
@echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)"
|
||||
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
|
||||
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
|
||||
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
|
||||
|
@ -186,7 +192,7 @@ endif
|
|||
ifneq ($(NO_SHARED),1)
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
|
|
|
@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V)
|
|||
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), CK860FV)
|
||||
TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), x280)
|
||||
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), RISCV64_ZVL256B)
|
||||
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), RISCV64_ZVL128B)
|
||||
TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), RISCV64_GENERIC)
|
||||
TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
|
||||
endif
|
||||
|
||||
all: getarch_2nd
|
||||
./getarch_2nd 0 >> $(TARGET_MAKE)
|
||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||
|
|
|
@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
|
|||
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
|
||||
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
|
||||
endif
|
||||
ifeq ($(CORE), x280)
|
||||
CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
|
||||
FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_ZVL256B)
|
||||
CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_ZVL128B)
|
||||
CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
|
||||
endif
|
||||
ifeq ($(CORE), RISCV64_GENERIC)
|
||||
CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
|
||||
FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
|
||||
endif
|
||||
|
|
|
@ -3,7 +3,12 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.26
|
||||
VERSION = 0.3.26.dev
|
||||
|
||||
# If you set this prefix, the library name will be lib$(LIBNAMEPREFIX)openblas.a
|
||||
# and lib$(LIBNAMEPREFIX)openblas.so, with a matching soname in the shared library
|
||||
#
|
||||
# LIBNAMEPREFIX = scipy
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
@ -365,8 +365,9 @@ GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
|||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||
GCCVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
||||
# Note that the behavior of -dumpversion is compile-time-configurable for
|
||||
# gcc-7.x and newer. Use -dumpfullversion there
|
||||
ifeq ($(GCCVERSIONGTEQ7),1)
|
||||
|
@ -873,6 +874,11 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), csky)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
#
|
||||
# C Compiler dependent settings
|
||||
#
|
||||
|
@ -1176,7 +1182,7 @@ ifeq ($(F_COMPILER), IBM)
|
|||
CCOMMON_OPT += -DF_INTERFACE_IBM
|
||||
FEXTRALIB += -lxlf90
|
||||
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
|
||||
FCOMMON_OPT += -qextname
|
||||
FCOMMON_OPT += -qextname -qzerosize
|
||||
endif
|
||||
# FCOMMON_OPT += -qarch=440
|
||||
ifdef BINARY64
|
||||
|
@ -1511,16 +1517,28 @@ ifndef LIBSONAMEBASE
|
|||
LIBSONAMEBASE = openblas
|
||||
endif
|
||||
|
||||
ifndef LIBNAMEPREFIX
|
||||
LIBNAMEPREFIX =
|
||||
endif
|
||||
|
||||
SYMPREFIX=$(SYMBOLPREFIX)
|
||||
ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
|
||||
SYMPREFIX=
|
||||
endif
|
||||
SYMSUFFIX=$(SYMBOLSUFFIX)
|
||||
ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
|
||||
SYMSUFFIX=
|
||||
endif
|
||||
ifndef LIBNAMESUFFIX
|
||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
|
||||
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
|
||||
else
|
||||
LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
|
||||
LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
LIBPREFIX = cyg$(LIBNAMEBASE)
|
||||
LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE)
|
||||
else
|
||||
LIBPREFIX = lib$(LIBNAMEBASE)
|
||||
LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE)
|
||||
endif
|
||||
|
||||
KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
|
||||
|
@ -1652,6 +1670,10 @@ ifeq ($(F_COMPILER),CRAY)
|
|||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
ifeq ($(F_COMPILER),FLANGNEW)
|
||||
LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
|
||||
LAPACK_CFLAGS = $(CFLAGS)
|
||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||
|
@ -1699,14 +1721,14 @@ LIBNAME_P = $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
|
|||
endif
|
||||
endif
|
||||
|
||||
# With FIXED_LIBNAME=1 the static and profiling archives are named from the
# optional prefix, the base name and the optional suffix only.
# Both names must use the same base variable, LIBSONAMEBASE.
ifeq ($(FIXED_LIBNAME),1)
|
||||
LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
|
||||
LIBNAME_P = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
|
||||
endif
|
||||
|
||||
LIBDLLNAME = $(LIBPREFIX).dll
|
||||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
||||
ifneq ($(OSNAME), AIX)
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
||||
else
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
||||
endif
|
||||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
||||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
||||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
||||
|
|
|
@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC)
|
|||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# cooperlake support was added in clang 12
|
||||
# sapphire rapids support was added in clang 12
|
||||
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
|
|
23
README.md
23
README.md
|
@ -167,6 +167,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||
- **Cortex A76**: same as A57 (different cpu specifications)
|
||||
- **Falkor**: same as A57 (different cpu specifications)
|
||||
- **ThunderX**: Optimized some Level-1 functions
|
||||
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
||||
|
@ -185,6 +186,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
||||
- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
|
||||
|
||||
- **AIX**: Dynamic architecture with OpenXL and OpenMP.
|
||||
```sh
|
||||
make CC=ibm-clang_r FC=xlf TARGET=POWER7 BINARY=64 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 USE_THREAD=1
|
||||
```
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2
|
||||
|
@ -198,6 +204,21 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
```
|
||||
(also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
|
||||
|
||||
- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
|
||||
- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
|
||||
e.g.:
|
||||
```sh
|
||||
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
|
||||
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
|
||||
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
|
||||
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j
|
||||
```
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
|
||||
|
@ -227,7 +248,7 @@ Please note that it is not possible to combine support for different architectur
|
|||
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **AIX**: Supported on PPC up to POWER10
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||
|
|
|
@ -93,6 +93,7 @@ CORTEXA53
|
|||
CORTEXA57
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
CORTEXA76
|
||||
CORTEXA510
|
||||
CORTEXA710
|
||||
CORTEXX1
|
||||
|
@ -118,8 +119,11 @@ Z13
|
|||
Z14
|
||||
|
||||
10.RISC-V 64:
|
||||
RISCV64_GENERIC
|
||||
RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
|
||||
RISCV64_ZVL128B
|
||||
C910V
|
||||
x280
|
||||
RISCV64_ZVL256B
|
||||
|
||||
11.LOONGARCH64:
|
||||
LOONGSONGENERIC
|
||||
|
@ -133,3 +137,7 @@ E2K
|
|||
EV4
|
||||
EV5
|
||||
EV6
|
||||
|
||||
14.CSKY
|
||||
CSKY
|
||||
CK860FV
|
||||
|
|
|
@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
|
|||
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
|
||||
|
||||
# x280 temporary workaround for gfortran
|
||||
ifeq ($(TARGET), x280)
|
||||
CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
|
||||
endif
|
||||
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
|
||||
|
@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
|
|||
clean ::
|
||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
|
||||
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
include $(TOPDIR)/Makefile.tail
|
|
@ -92,7 +92,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
||||
|
||||
|
|
|
@ -85,7 +85,7 @@ int main(int argc, char *argv[]){
|
|||
double time1, time2, timeg1,timeg2;
|
||||
|
||||
char *p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
|
|
|
@ -120,7 +120,7 @@ int main(int argc, char *argv[]){
|
|||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ int main(int argc, char *argv[]){
|
|||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
|
|
24
c_check
24
c_check
|
@ -91,6 +91,7 @@ case "$data" in
|
|||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_RISCV64*) architecture=riscv64 ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
*ARCH_CSKY*) architecture=csky ;;
|
||||
esac
|
||||
|
||||
defined=0
|
||||
|
@ -236,6 +237,7 @@ case "$data" in
|
|||
*ARCH_ARM*) architecture=arm ;;
|
||||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
*ARCH_CSKY*) architecture=csky ;;
|
||||
esac
|
||||
|
||||
binformat='bin32'
|
||||
|
@ -244,6 +246,7 @@ case "$data" in
|
|||
esac
|
||||
|
||||
no_avx512=0
|
||||
no_avx512bf=0
|
||||
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
|
@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
|
|||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
if [ "$no_avx512" -eq 0 ]; then
|
||||
tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
|
||||
printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
|
||||
if [ "$compiler" = "PGI" ]; then
|
||||
args=" -tp cooperlake -c -o $tmpf.o $tmpf"
|
||||
else
|
||||
args=" -march=cooperlake -c -o $tmpf.o $tmpf"
|
||||
fi
|
||||
no_avx512bf=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_avx512bf=1
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
fi
|
||||
|
||||
no_rv64gv=0
|
||||
|
@ -409,6 +431,7 @@ done
|
|||
[ "$makefile" = "-" ] && {
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
exit 0
|
||||
|
@ -437,6 +460,7 @@ done
|
|||
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
||||
|
|
|
@ -97,6 +97,7 @@ $architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
|||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
|
@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
|
|||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "csky") {
|
||||
$defined = 1;
|
||||
$binary = 32;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
|
@ -284,6 +290,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
|
|||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
$architecture = csky if ($data =~ /ARCH_CSKY/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
|
22
cblas.h
22
cblas.h
|
@ -12,6 +12,7 @@ extern "C" {
|
|||
/*Set the number of threads at runtime.*/
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
void goto_set_num_threads(int num_threads);
|
||||
int openblas_set_num_threads_local(int num_threads);
|
||||
|
||||
/*Get the number of threads at runtime.*/
|
||||
int openblas_get_num_threads(void);
|
||||
|
@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
|||
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
||||
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||
|
@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
|
|||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||
|
@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
|
|||
void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
|
||||
void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||
void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
|
||||
void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
|
||||
|
||||
void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||
|
|
|
@ -64,6 +64,7 @@ else ()
|
|||
"#define NEEDBUNDERSCORE 1\n")
|
||||
endif()
|
||||
|
||||
if (CMAKE_Fortran_COMPILER)
|
||||
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
|
||||
string(TOUPPER ${F_COMPILER} F_COMPILER)
|
||||
|
||||
endif()
|
||||
|
|
|
@ -6,9 +6,6 @@
|
|||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
endif ()
|
||||
if (USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
|
@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
|||
if (MIPS64)
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
|
||||
endif ()
|
||||
|
@ -83,9 +83,14 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
|||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
if (ARM64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
else ()
|
||||
if (BINARY64)
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
endif ()
|
||||
if (INTERFACE64)
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
|
||||
if (WIN32)
|
||||
|
@ -98,7 +103,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
|||
endif ()
|
||||
endif ()
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -1,4 +1,6 @@
|
|||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
libnameprefix=@LIBNAMEPREFIX@
|
||||
libnamesuffix=@LIBNAMESUFFIX@
|
||||
libsuffix=@SUFFIX64_UNDERSCORE@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
|
@ -7,5 +9,5 @@ Name: OpenBLAS
|
|||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OpenBLAS_VERSION@
|
||||
URL: https://github.com/OpenMathLib/OpenBLAS
|
||||
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
|
||||
Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix}
|
||||
Cflags: -I${includedir}
|
||||
|
|
|
@ -932,7 +932,7 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73" OR "${TCORE}" STREQUAL "CORTEXA76")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t49152\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
|
|
@ -501,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH
|
|||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas")
|
||||
|
||||
if (DEFINED LIBNAMESUFFIX)
|
||||
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
|
||||
else ()
|
||||
set(LIBPREFIX "libopenblas")
|
||||
set(LIBPREFIX "${LIBNAMEPREFIX}_${LIBNAMESUFFIX}")
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED SYMBOLPREFIX)
|
||||
|
@ -615,13 +616,19 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
|||
endforeach ()
|
||||
endif ()
|
||||
|
||||
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY")
|
||||
if (CMAKE_Fortran_COMPILER)
|
||||
if (${F_COMPILER} STREQUAL "NAG" OR ${F_COMPILER} STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512")
|
||||
if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*")
|
||||
message(STATUS "removing fortran flags")
|
||||
set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64")
|
||||
endif ()
|
||||
foreach (FILTER_FLAG ${FILTER_FLAGS})
|
||||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS})
|
||||
string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS})
|
||||
endforeach ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
|
||||
# lapack-netlib is rife with uninitialized warnings -hpa
|
||||
|
@ -679,6 +686,10 @@ else ()
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (DEFINED FIXED_LIBNAME)
|
||||
set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}")
|
||||
set (LIBNAME "${LIBPREFIX}_p.${LIBSUFFIX}")
|
||||
endif()
|
||||
|
||||
set(LIBDLLNAME "${LIBPREFIX}.dll")
|
||||
set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")
|
||||
|
|
24
common.h
24
common.h
|
@ -358,12 +358,6 @@ typedef int blasint;
|
|||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
|
||||
#endif
|
||||
|
||||
#ifdef BULLDOZER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#ifndef YIELDING
|
||||
|
@ -371,21 +365,13 @@ typedef int blasint;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
#ifdef PILEDRIVER
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
/*
|
||||
#ifdef STEAMROLLER
|
||||
#if defined(ARCH_X86_64)
|
||||
#ifndef YIELDING
|
||||
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||
#endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
|
||||
#ifdef __EMSCRIPTEN__
|
||||
#define YIELDING
|
||||
|
@ -396,7 +382,7 @@ typedef int blasint;
|
|||
#endif
|
||||
|
||||
/***
|
||||
To alloc job_t on heap or statck.
|
||||
To alloc job_t on heap or stack.
|
||||
please see https://github.com/xianyi/OpenBLAS/issues/246
|
||||
***/
|
||||
#if defined(OS_WINDOWS)
|
||||
|
@ -482,6 +468,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_e2k.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_CSKY
|
||||
#include "common_csky.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
|
|
|
@ -0,0 +1,56 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_CSKY
|
||||
#define COMMON_CSKY
|
||||
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
|
||||
/* Integer division helper: returns the quotient x / y computed with the
 * plain C division operator. The blasint quotient is converted to the
 * int return type. No check is made for y == 0. */
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#endif
|
|
@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
|
|||
void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
|
||||
xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
|
||||
|
||||
void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
|
||||
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
|
||||
double *, blasint *, double *, blasint *, double *, double *, blasint *);
|
||||
void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
|
||||
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
|
||||
double *, blasint *, double *, blasint *, double *, double *, blasint *);
|
||||
|
||||
int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
|
||||
float *, float *, blasint *, float *, blasint *,
|
||||
float *, float *, blasint *);
|
||||
|
@ -764,8 +773,8 @@ xdouble BLASFUNC(qlamc3)(xdouble *, xdouble *);
|
|||
|
||||
void BLASFUNC(saxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(daxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
||||
void BLASFUNC(caxpby) (blasint *, float *, float *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(zaxpby) (blasint *, double *, double *, blasint *, double *, double *, blasint *);
|
||||
void BLASFUNC(caxpby) (blasint *, void *, float *, blasint *, void *, float *, blasint *);
|
||||
void BLASFUNC(zaxpby) (blasint *, void *, double *, blasint *, void *, double *, blasint *);
|
||||
|
||||
void BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *);
|
||||
void BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double *, double *, blasint *, double *, blasint *);
|
||||
|
|
|
@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#if defined(C910V)
|
||||
#include <riscv_vector.h>
|
||||
#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
|
||||
# include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
|
||||
// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
|
||||
#define RISCV_0p10_INTRINSICS
|
||||
#define RISCV_RVV(x) x
|
||||
#else
|
||||
#define RISCV_RVV(x) __riscv_ ## x
|
||||
#endif
|
||||
|
||||
#if defined(C910V) || defined(RISCV64_ZVL256B)
|
||||
# if !defined(DOUBLE)
|
||||
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
|
||||
# else
|
||||
# define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
|
||||
# endif
|
||||
#else
|
||||
# define EXTRACT_FLOAT(v) (v[0])
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -137,19 +137,20 @@ typedef struct blas_queue {
|
|||
|
||||
extern int blas_server_avail;
|
||||
extern int blas_omp_number_max;
|
||||
extern int blas_omp_threads_local;
|
||||
|
||||
static __inline int num_cpu_avail(int level) {
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
int openmp_nthreads;
|
||||
openmp_nthreads=omp_get_max_threads();
|
||||
if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
if (blas_cpu_number == 1
|
||||
#endif
|
||||
#ifdef USE_OPENMP
|
||||
if (openmp_nthreads == 1 || omp_in_parallel()
|
||||
#else
|
||||
if (openmp_nthreads == 1
|
||||
#endif
|
||||
) return 1;
|
||||
|
||||
|
|
|
@ -42,6 +42,7 @@ size_t length64=sizeof(value64);
|
|||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
#define CPU_CORTEXA76 23
|
||||
#define CPU_NEOVERSEN1 11
|
||||
#define CPU_NEOVERSEV1 16
|
||||
#define CPU_NEOVERSEN2 17
|
||||
|
@ -89,7 +90,8 @@ static char *cpuname[] = {
|
|||
"CORTEXX2",
|
||||
"CORTEXA510",
|
||||
"CORTEXA710",
|
||||
"FT2000"
|
||||
"FT2000",
|
||||
"CORTEXA76"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
@ -115,7 +117,8 @@ static char *cpuname_lower[] = {
|
|||
"cortexx2",
|
||||
"cortexa510",
|
||||
"cortexa710",
|
||||
"ft2000"
|
||||
"ft2000",
|
||||
"cortexa76"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
@ -210,6 +213,8 @@ int detect(void)
|
|||
return CPU_CORTEXX2;
|
||||
else if (strstr(cpu_part, "0xd4e")) //X3
|
||||
return CPU_CORTEXX2;
|
||||
else if (strstr(cpu_part, "0xd0b"))
|
||||
return CPU_CORTEXA76;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
|
@ -391,6 +396,7 @@ void get_cpuconfig(void)
|
|||
break;
|
||||
|
||||
case CPU_NEOVERSEV1:
|
||||
case CPU_CORTEXA76:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
|
|
|
@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_C910V 1
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_C910V 1
|
||||
#define CPU_x280 2
|
||||
#define CPU_RISCV64_ZVL256B 3
|
||||
#define CPU_RISCV64_ZVL128B 4
|
||||
|
||||
static char *cpuname[] = {
|
||||
"RISCV64_GENERIC",
|
||||
"C910V"
|
||||
"C910V",
|
||||
"x280",
|
||||
"CPU_RISCV64_ZVL256B",
|
||||
"CPU_RISCV64_ZVL128B"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"riscv64_generic",
|
||||
"c910v",
|
||||
"x280",
|
||||
"riscv64_zvl256b",
|
||||
"riscv64_zvl128b"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
@ -86,23 +100,29 @@ int detect(void){
|
|||
char *pmodel = NULL, *pisa = NULL;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
if (!infile)
|
||||
return CPU_GENERIC;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if(!strncmp(buffer, "model name", 10)){
|
||||
strcpy(model_buffer, buffer);
|
||||
pmodel = strchr(isa_buffer, ':') + 1;
|
||||
pmodel = strchr(model_buffer, ':');
|
||||
if (pmodel)
|
||||
pmodel++;
|
||||
}
|
||||
|
||||
if(!strncmp(buffer, "isa", 3)){
|
||||
strcpy(isa_buffer, buffer);
|
||||
pisa = strchr(isa_buffer, '4') + 1;
|
||||
pisa = strchr(isa_buffer, '4');
|
||||
if (pisa)
|
||||
pisa++;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (!pmodel)
|
||||
if (!pmodel || !pisa)
|
||||
return(CPU_GENERIC);
|
||||
|
||||
|
||||
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
|
||||
return CPU_C910V;
|
||||
|
||||
|
@ -140,5 +160,5 @@ void get_cpuconfig(void){
|
|||
}
|
||||
|
||||
void get_libname(void){
|
||||
printf("riscv64\n");
|
||||
printf("%s", cpuname_lower[detect()]);
|
||||
}
|
||||
|
|
4
ctest.c
4
ctest.c
|
@ -173,6 +173,10 @@ HAVE_C11
|
|||
ARCH_E2K
|
||||
#endif
|
||||
|
||||
#if defined(__csky__)
|
||||
ARCH_CSKY
|
||||
#endif
|
||||
|
||||
#if defined(__EMSCRIPTEN__)
|
||||
ARCH_RISCV64
|
||||
OS_WINDOWS
|
||||
|
|
|
@ -40,6 +40,10 @@ else()
|
|||
c_${float_char}blas1.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat1 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat1 m)
|
||||
endif()
|
||||
|
@ -65,6 +69,10 @@ else()
|
|||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat2 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat2 m)
|
||||
endif()
|
||||
|
@ -80,6 +88,17 @@ if (NOT NOFORTRAN)
|
|||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
if (USE_GEMM3M)
|
||||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||
add_executable(x${float_char}cblat3_3m
|
||||
c_${float_char}blat3_3m.f
|
||||
c_${float_char}blas3_3m.c
|
||||
c_${float_char}3chke_3m.c
|
||||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
add_executable(x${float_char}cblat3
|
||||
c_${float_char}blat3c.c
|
||||
|
@ -88,12 +107,44 @@ else()
|
|||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
if (USE_GEMM3M)
|
||||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||
add_executable(x${float_char}cblat3_3m
|
||||
c_${float_char}blat3c_3m.c
|
||||
c_${float_char}blas3_3m.c
|
||||
c_${float_char}3chke_3m.c
|
||||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat3 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat3 m)
|
||||
endif()
|
||||
# Link settings for the optional GEMM3M test driver. The x${float_char}cblat3_3m
# executable is only created for the complex precisions ("c" and "z"), so the
# same pair of conditions guards every reference to that target here.
if (USE_GEMM3M)
  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
    target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
    # GNU Fortran together with Clang as the C compiler: drop -fopenmp from the
    # Fortran flags and link omp/pthread explicitly, exactly as is done for the
    # x${float_char}cblat1/2/3 executables.
    if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
      string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
      # Fix: the libraries must be attached to the _3m target itself. The
      # previous text named x${float_char}cblat3 here, which left the 3M
      # executable without omp/pthread.
      target_link_libraries(x${float_char}cblat3_3m omp pthread)
    endif()
    # The math library is linked explicitly on Linux, FreeBSD and QNX.
    if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
      target_link_libraries(x${float_char}cblat3_3m m)
    endif()
  endif()
endif()
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
|
||||
if (USE_GEMM3M)
|
||||
if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
|
||||
add_test(NAME "x${float_char}cblat3_3m"
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
|
|
|
@ -5,6 +5,24 @@
|
|||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
SUPPORT_GEMM3M = 0
|
||||
|
||||
ifeq ($(ARCH), x86)
|
||||
SUPPORT_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), x86_64)
|
||||
SUPPORT_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), ia64)
|
||||
SUPPORT_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), MIPS)
|
||||
SUPPORT_GEMM3M = 1
|
||||
endif
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
ifeq ($(F_COMPILER),GFORTRAN)
|
||||
override FFLAGS += -fno-tree-vectorize
|
||||
|
@ -144,9 +162,15 @@ all3targets += xdcblat3
|
|||
endif
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
all3targets += xccblat3
|
||||
ifeq ($(SUPPORT_GEMM3M),1)
|
||||
all3targets += xccblat3_3m
|
||||
endif
|
||||
endif
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
all3targets += xzcblat3
|
||||
ifeq ($(SUPPORT_GEMM3M),1)
|
||||
all3targets += xzcblat3_3m
|
||||
endif
|
||||
endif
|
||||
|
||||
all3: $(all3targets)
|
||||
|
@ -181,9 +205,9 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
all3_3m: xzcblat3_3m xccblat3_3m
|
||||
ifeq ($(SUPPORT_GEMM3M),1)
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
|
||||
endif
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
|
@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
|
|||
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
@ -218,6 +243,9 @@ ifeq ($(F_COMPILER), IBM)
|
|||
ifeq ($(C_COMPILER), GCC)
|
||||
CEXTRALIB += -lgomp
|
||||
endif
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CEXTRALIB += -lomp
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -268,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
|
|||
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
ifeq ($(SUPPORT_GEMM3M),1)
|
||||
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
endif
|
||||
else
|
||||
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
|
@ -277,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
|
|||
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
ifeq ($(SUPPORT_GEMM3M),1)
# CGEMM3M test built from the C driver (c_cblat3c_3m.o) and linked with $(CC).
# -lgfortran is filtered out of EXTRALIB and the libraries are ordered the same
# way as in the neighbouring xccblat1/xccblat2/xccblat3 rules that link with $(CC).
xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -290,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
|
|||
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
ifeq ($(SUPPORT_GEMM3M),1)
|
||||
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
endif
|
||||
else
|
||||
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
|
@ -299,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
|
|||
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
ifeq ($(SUPPORT_GEMM3M),1)
# ZGEMM3M test built from the C driver (c_zblat3c_3m.o) and linked with $(CC).
# -lgfortran is filtered out of EXTRALIB and the libraries are ordered the same
# way as in the neighbouring xzcblat1/xzcblat2/xzcblat3 rules that link with $(CC).
xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
@ -96,7 +96,7 @@
|
|||
INTEGER ICAMAXTEST
|
||||
EXTERNAL SCASUMTEST, SCNRM2TEST, ICAMAXTEST
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
|
||||
EXTERNAL CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
|
||||
* .. Intrinsic Functions ..
|
||||
INTRINSIC MAX
|
||||
* .. Common blocks ..
|
||||
|
@ -214,8 +214,8 @@
|
|||
CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
|
||||
+ STRUE4(NP1),SFAC)
|
||||
ELSE IF (ICASE.EQ.8) THEN
|
||||
* .. CSCAL ..
|
||||
CALL CSCAL(N,CA,CX,INCX)
|
||||
* .. CSCALTEST ..
|
||||
CALL CSCALTEST(N,CA,CX,INCX)
|
||||
CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
|
||||
+ SFAC)
|
||||
ELSE IF (ICASE.EQ.9) THEN
|
||||
|
@ -236,14 +236,14 @@
|
|||
*
|
||||
INCX = 1
|
||||
IF (ICASE.EQ.8) THEN
|
||||
* CSCAL
|
||||
* CSCALTEST
|
||||
* Add a test for alpha equal to zero.
|
||||
CA = (0.0E0,0.0E0)
|
||||
DO 80 I = 1, 5
|
||||
MWPCT(I) = (0.0E0,0.0E0)
|
||||
MWPCS(I) = (1.0E0,1.0E0)
|
||||
80 CONTINUE
|
||||
CALL CSCAL(5,CA,CX,INCX)
|
||||
CALL CSCALTEST(5,CA,CX,INCX)
|
||||
CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
|
||||
ELSE IF (ICASE.EQ.9) THEN
|
||||
* CSSCALTEST
|
||||
|
|
|
@ -440,6 +440,7 @@ static real c_b43 = (float)1.;
|
|||
extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
|
||||
static complex mwpcs[5], mwpct[5];
|
||||
extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
|
||||
extern /* Subroutine */ int cscaltest_(), itest1_(), stest1_();
|
||||
static complex cx[8];
|
||||
extern real scnrm2test_(integer*, complex*, integer*);
|
||||
static integer np1;
|
||||
|
@ -481,7 +482,7 @@ static real c_b43 = (float)1.;
|
|||
stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
|
||||
} else if (combla_1.icase == 8) {
|
||||
/* .. CSCAL .. */
|
||||
cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
|
||||
cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
|
||||
ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
|
||||
&ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
|
||||
} else if (combla_1.icase == 9) {
|
||||
|
@ -515,7 +516,7 @@ static real c_b43 = (float)1.;
|
|||
mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
|
||||
/* L80: */
|
||||
}
|
||||
cscal_(&c__5, &ca, cx, &combla_1.incx);
|
||||
cscaltest_(&c__5, &ca, cx, &combla_1.incx);
|
||||
ctest_(&c__5, cx, mwpct, mwpcs, sfac);
|
||||
} else if (combla_1.icase == 9) {
|
||||
/* CSSCALTEST */
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -545,13 +545,31 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
*range_n, IFLOAT *sa, IFLOAT *sb,
|
||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#ifdef USE_OPENMP
|
||||
static omp_lock_t level3_lock, critical_section_lock;
|
||||
static volatile BLASLONG init_lock = 0, omp_lock_initialized = 0,
|
||||
parallel_section_left = MAX_PARALLEL_NUMBER;
|
||||
|
||||
// Lock initialization; Todo : Maybe this part can be moved to blas_init() in blas_server_omp.c
|
||||
while(omp_lock_initialized == 0)
|
||||
{
|
||||
blas_lock(&init_lock);
|
||||
{
|
||||
if(omp_lock_initialized == 0)
|
||||
{
|
||||
omp_init_lock(&level3_lock);
|
||||
omp_init_lock(&critical_section_lock);
|
||||
omp_lock_initialized = 1;
|
||||
WMB;
|
||||
}
|
||||
blas_unlock(&init_lock);
|
||||
}
|
||||
}
|
||||
#elif defined(OS_WINDOWS)
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#else
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#endif
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
@ -599,12 +617,28 @@ InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#ifdef USE_OPENMP
|
||||
omp_set_lock(&level3_lock);
|
||||
omp_set_lock(&critical_section_lock);
|
||||
|
||||
parallel_section_left--;
|
||||
|
||||
/*
|
||||
How OpenMP locks works with NUM_PARALLEL
|
||||
1) parallel_section_left = Number of available concurrent executions of OpenBLAS - Number of currently executing OpenBLAS executions
|
||||
2) level3_lock is acting like a master lock or barrier which stops OpenBLAS calls when all the parallel_section are currently busy executing other OpenBLAS calls
|
||||
3) critical_section_lock is used for updating variables shared between threads executing OpenBLAS calls concurrently and for unlocking of master lock whenever required
|
||||
4) Unlock master lock only when we have not already exhausted all the parallel_sections and allow another thread with a OpenBLAS call to enter
|
||||
*/
|
||||
if(parallel_section_left != 0)
|
||||
omp_unset_lock(&level3_lock);
|
||||
|
||||
omp_unset_lock(&critical_section_lock);
|
||||
|
||||
#elif defined(OS_WINDOWS)
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#else
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#endif
|
||||
|
||||
#ifdef USE_ALLOC_HEAP
|
||||
|
@ -732,12 +766,24 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
|||
free(job);
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#else
|
||||
#ifdef USE_OPENMP
|
||||
omp_set_lock(&critical_section_lock);
|
||||
parallel_section_left++;
|
||||
|
||||
/*
|
||||
Unlock master lock only when all the parallel_sections are already exhausted and one of the thread has completed its OpenBLAS call
|
||||
otherwise just increment the parallel_section_left
|
||||
The master lock is only locked when we have exhausted all the parallel_sections, So only unlock it then and otherwise just increment the count
|
||||
*/
|
||||
if(parallel_section_left == 1)
|
||||
omp_unset_lock(&level3_lock);
|
||||
|
||||
omp_unset_lock(&critical_section_lock);
|
||||
|
||||
#elif defined(OS_WINDOWS)
|
||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#else
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void);
|
|||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
||||
|
||||
int blas_omp_threads_local = 1;
|
||||
|
||||
/* Local Variables */
|
||||
#if defined(USE_PTHREAD_LOCK)
|
||||
static pthread_mutex_t server_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
|
|
|
@ -69,6 +69,7 @@
|
|||
|
||||
int blas_server_avail = 0;
|
||||
int blas_omp_number_max = 0;
|
||||
int blas_omp_threads_local = 1;
|
||||
|
||||
extern int openblas_omp_adaptive_env(void);
|
||||
|
||||
|
@ -406,7 +407,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
}
|
||||
#endif
|
||||
|
||||
while(true) {
|
||||
while (true) {
|
||||
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||
#ifdef HAVE_C11
|
||||
_Bool inuse = false;
|
||||
|
@ -419,10 +420,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
break;
|
||||
}
|
||||
}
|
||||
if(i != MAX_PARALLEL_NUMBER)
|
||||
break;
|
||||
}
|
||||
|
||||
if (i != MAX_PARALLEL_NUMBER)
|
||||
break;
|
||||
}
|
||||
if (openblas_omp_adaptive_env() != 0) {
|
||||
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
|
|
@ -48,6 +48,12 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
|
||||
#else
|
||||
# define MT_TRACE(...)
|
||||
#endif
|
||||
|
||||
/* This is a thread implementation for Win32 lazy implementation */
|
||||
|
||||
/* Thread server common information */
|
||||
|
@ -59,6 +65,8 @@ static CRITICAL_SECTION queue_lock;
|
|||
/* We need this global for checking if initialization is finished. */
|
||||
int blas_server_avail = 0;
|
||||
|
||||
int blas_omp_threads_local = 1;
|
||||
|
||||
/* Local Variables */
|
||||
static BLASULONG server_lock = 0;
|
||||
|
||||
|
@ -66,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER];
|
|||
static DWORD blas_threads_id[MAX_CPU_NUMBER];
|
||||
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
|
||||
|
||||
#if defined (__GNUC__) && (__GNUC__ < 6)
|
||||
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
|
||||
#else
|
||||
#if defined(_WIN64)
|
||||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
|
||||
#else
|
||||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
|
||||
#endif
|
||||
#endif
|
||||
//
|
||||
// Legacy code path
|
||||
//
|
||||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
|
||||
|
||||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
|
||||
if (!(mode & BLAS_COMPLEX)){
|
||||
if (!(mode & BLAS_COMPLEX)) {
|
||||
#ifdef EXPRECISION
|
||||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
|
||||
/* REAL / Extended Double */
|
||||
|
@ -93,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> c, args -> ldc, sb);
|
||||
} else
|
||||
#endif
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
|
||||
/* REAL / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
|
@ -104,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
} else if ((mode & BLAS_PREC) == BLAS_SINGLE){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
|
||||
/* REAL / Single */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
|
@ -116,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
#ifdef BUILD_BFLOAT16
|
||||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
|
||||
/* REAL / BFLOAT16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
|
@ -127,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_STOBF16) {
|
||||
/* REAL / BLAS_STOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
|
@ -138,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
|
||||
/* REAL / BLAS_DTOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
|
@ -155,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
}
|
||||
} else {
|
||||
#ifdef EXPRECISION
|
||||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
|
||||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
|
||||
/* COMPLEX / Extended Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
|
||||
xdouble *, BLASLONG, xdouble *, BLASLONG,
|
||||
|
@ -169,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> c, args -> ldc, sb);
|
||||
} else
|
||||
#endif
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
|
||||
/* COMPLEX / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
|
@ -199,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
}
|
||||
}
|
||||
|
||||
/* This is a main routine of threads. Each thread waits until job is */
|
||||
/* queued. */
|
||||
|
||||
static DWORD WINAPI blas_thread_server(void *arg){
|
||||
//
|
||||
// This is a main routine of threads. Each thread waits until job is queued.
|
||||
//
|
||||
static DWORD WINAPI blas_thread_server(void *arg) {
|
||||
|
||||
/* Thread identifier */
|
||||
BLASLONG cpu = (BLASLONG)arg;
|
||||
|
@ -213,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
|||
/* Each server needs each buffer */
|
||||
buffer = blas_memory_alloc(2);
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
|
||||
#endif
|
||||
MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
|
||||
|
||||
while (1){
|
||||
while (1) {
|
||||
|
||||
/* Waiting for Queue */
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
|
||||
#endif
|
||||
// event raised when work is added to the queue
|
||||
WaitForSingleObject(kickoff_event, INFINITE);
|
||||
MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
|
||||
|
||||
if (cpu > thread_target - 2)
|
||||
{
|
||||
//printf("thread [%d] exiting.\n", cpu);
|
||||
break; // excess thread, so worker thread exits
|
||||
}
|
||||
// event raised when work is added to the queue
|
||||
WaitForSingleObject(kickoff_event, INFINITE);
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
|
||||
#endif
|
||||
if (cpu > thread_target - 2) {
|
||||
//MT_TRACE("thread [%d] exiting.\n", cpu);
|
||||
break; // excess thread, so worker thread exits
|
||||
}
|
||||
|
||||
MT_TRACE("Server[%2ld] Got it.\n", cpu);
|
||||
|
||||
#if 1
|
||||
EnterCriticalSection(&queue_lock);
|
||||
|
||||
queue = work_queue;
|
||||
|
@ -245,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
|||
work_queue = work_queue->next;
|
||||
|
||||
LeaveCriticalSection(&queue_lock);
|
||||
#else
|
||||
volatile blas_queue_t* queue_next;
|
||||
|
||||
INT_PTR prev_value;
|
||||
do {
|
||||
queue = (volatile blas_queue_t*)work_queue;
|
||||
if (!queue)
|
||||
break;
|
||||
|
||||
queue_next = (volatile blas_queue_t*)queue->next;
|
||||
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
|
||||
} while (prev_value != queue);
|
||||
#endif
|
||||
|
||||
if (queue) {
|
||||
if (queue) {
|
||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
|
||||
|
||||
sa = queue -> sa;
|
||||
sb = queue -> sb;
|
||||
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
|
||||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
||||
#endif
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
|
||||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
||||
#endif
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
|
||||
MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
|
||||
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
|
||||
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING1;
|
||||
#endif
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING1;
|
||||
#endif
|
||||
|
||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
if (sa == NULL)
|
||||
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
|
||||
if (sb == NULL) {
|
||||
if (!(queue -> mode & BLAS_COMPLEX)){
|
||||
if (!(queue -> mode & BLAS_COMPLEX)) {
|
||||
#ifdef EXPRECISION
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) {
|
||||
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
} else
|
||||
#endif
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
|
||||
#ifdef BUILD_DOUBLE
|
||||
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
|
@ -325,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
|||
/* Other types in future */
|
||||
}
|
||||
}
|
||||
queue->sb=sb;
|
||||
queue->sb=sb;
|
||||
}
|
||||
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING2;
|
||||
#endif
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING2;
|
||||
#endif
|
||||
|
||||
if (!(queue -> mode & BLAS_LEGACY)) {
|
||||
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
||||
} else {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
||||
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
||||
}
|
||||
}else{
|
||||
continue; //if queue == NULL
|
||||
}
|
||||
} else {
|
||||
continue; //if queue == NULL
|
||||
}
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
|
||||
#endif
|
||||
MT_TRACE("Server[%2ld] Finished!\n", cpu);
|
||||
|
||||
queue->finished = 1;
|
||||
|
||||
queue->finished = 1;
|
||||
}
|
||||
|
||||
/* Shutdown procedure */
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
|
||||
#endif
|
||||
MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
|
||||
|
||||
blas_memory_free(buffer);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Initializing routine */
|
||||
int blas_thread_init(void){
|
||||
//
|
||||
// Initializing routine
|
||||
//
|
||||
int blas_thread_init(void) {
|
||||
BLASLONG i;
|
||||
|
||||
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
|
||||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
|
||||
blas_cpu_number);
|
||||
#endif
|
||||
MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
|
||||
|
||||
if (!blas_server_avail){
|
||||
// create the kickoff Event
|
||||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
if (!blas_server_avail) {
|
||||
// create the kickoff Event
|
||||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
|
||||
thread_target = blas_cpu_number;
|
||||
thread_target = blas_cpu_number;
|
||||
|
||||
InitializeCriticalSection(&queue_lock);
|
||||
|
||||
for(i = 0; i < blas_cpu_number - 1; i++){
|
||||
//printf("thread_init: creating thread [%d]\n", i);
|
||||
for(i = 0; i < blas_cpu_number - 1; i++) {
|
||||
//MT_TRACE("thread_init: creating thread [%d]\n", i);
|
||||
|
||||
blas_threads[i] = CreateThread(NULL, 0,
|
||||
blas_thread_server, (void *)i,
|
||||
|
@ -398,15 +371,12 @@ int blas_thread_init(void){
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
User can call one of two routines.
|
||||
|
||||
exec_blas_async ... immediately returns after jobs are queued.
|
||||
|
||||
exec_blas ... returns after jobs are finished.
|
||||
*/
|
||||
|
||||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||
//
|
||||
// User can call one of two routines.
|
||||
// exec_blas_async ... immediately returns after jobs are queued.
|
||||
// exec_blas ... returns after jobs are finished.
|
||||
//
|
||||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
|
||||
|
||||
#if defined(SMP_SERVER)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
|
@ -426,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
|
||||
#endif
|
||||
|
||||
current->finished = 0;
|
||||
current->finished = 0;
|
||||
current = current -> next;
|
||||
pos ++;
|
||||
}
|
||||
|
@ -435,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
|
||||
if (!work_queue)
|
||||
{
|
||||
work_queue = queue;
|
||||
work_queue = queue;
|
||||
}
|
||||
else
|
||||
{
|
||||
blas_queue_t *next_item = work_queue;
|
||||
blas_queue_t *queue_item = work_queue;
|
||||
|
||||
// find the end of the work queue
|
||||
while (next_item)
|
||||
next_item = next_item->next;
|
||||
// find the end of the work queue
|
||||
while (queue_item->next)
|
||||
queue_item = queue_item->next;
|
||||
|
||||
// add new work to the end
|
||||
next_item = queue;
|
||||
// add new work to the end
|
||||
queue_item->next = queue;
|
||||
}
|
||||
|
||||
LeaveCriticalSection(&queue_lock);
|
||||
|
@ -456,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
|||
return 0;
|
||||
}
|
||||
|
||||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
|
||||
//
|
||||
// Join. Wait for all queued tasks to complete
|
||||
//
|
||||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Synchronization Waiting.\n");
|
||||
#endif
|
||||
MT_TRACE("Synchronization Waiting.\n");
|
||||
|
||||
while (num){
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Waiting Queue ..\n");
|
||||
#endif
|
||||
while (!queue->finished)
|
||||
YIELDING;
|
||||
while (num) {
|
||||
MT_TRACE("Waiting Queue ..\n");
|
||||
|
||||
queue = queue->next;
|
||||
num--;
|
||||
}
|
||||
while (!queue->finished)
|
||||
YIELDING;
|
||||
|
||||
queue = queue->next;
|
||||
num--;
|
||||
}
|
||||
|
||||
MT_TRACE("Completely Done.\n\n");
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Completely Done.\n\n");
|
||||
#endif
|
||||
// if work was added to the queue after this batch we can't sleep the worker threads
|
||||
// by resetting the event
|
||||
EnterCriticalSection(&queue_lock);
|
||||
|
@ -488,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Execute Threads */
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
//
|
||||
// Execute Threads
|
||||
//
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue) {
|
||||
|
||||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
|
@ -502,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
|
||||
if ((num <= 0) || (queue == NULL)) return 0;
|
||||
|
||||
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
|
||||
if ((num > 1) && queue -> next)
|
||||
exec_blas_async(1, queue -> next);
|
||||
|
||||
routine = queue -> routine;
|
||||
|
||||
if (queue -> mode & BLAS_LEGACY) {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
||||
} else
|
||||
} else {
|
||||
if (queue -> mode & BLAS_PTHREAD) {
|
||||
void (*pthreadcompat)(void *) = queue -> routine;
|
||||
(pthreadcompat)(queue -> args);
|
||||
} else
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n,
|
||||
queue -> sa, queue -> sb, 0);
|
||||
queue -> sa, queue -> sb, 0);
|
||||
}
|
||||
|
||||
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
|
||||
if ((num > 1) && queue -> next)
|
||||
exec_blas_async_wait(num - 1, queue -> next);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Shutdown procedure, but user don't have to call this routine. The */
|
||||
/* kernel automatically kill threads. */
|
||||
|
||||
int BLASFUNC(blas_thread_shutdown)(void){
|
||||
//
|
||||
// Shutdown procedure. The user does not have to call this routine; the
|
||||
// kernel automatically kills the threads.
|
||||
//
|
||||
int BLASFUNC(blas_thread_shutdown)(void) {
|
||||
|
||||
int i;
|
||||
|
||||
|
@ -532,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
if (blas_server_avail){
|
||||
if (blas_server_avail) {
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
// Could also just use WaitForMultipleObjects
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
|
||||
|
||||
|
@ -556,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Legacy function to set number of threads
|
||||
//
|
||||
void goto_set_num_threads(int num_threads)
|
||||
{
|
||||
long i;
|
||||
|
@ -569,7 +547,7 @@ void goto_set_num_threads(int num_threads)
|
|||
|
||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
if (blas_server_avail && num_threads < blas_num_threads) {
|
||||
if (blas_server_avail && num_threads < blas_num_threads) {
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
thread_target = num_threads;
|
||||
|
@ -577,11 +555,11 @@ void goto_set_num_threads(int num_threads)
|
|||
SetEvent(kickoff_event);
|
||||
|
||||
for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
|
||||
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
|
||||
//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
|
||||
|
||||
WaitForSingleObject(blas_threads[i], INFINITE);
|
||||
|
||||
//printf("set_num_threads: thread [%d] has quit.\n", i);
|
||||
//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
@ -599,8 +577,8 @@ void goto_set_num_threads(int num_threads)
|
|||
|
||||
thread_target = num_threads;
|
||||
|
||||
//increased_threads = 1;
|
||||
if (!blas_server_avail){
|
||||
//increased_threads = 1;
|
||||
if (!blas_server_avail) {
|
||||
// create the kickoff Event
|
||||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
|
||||
|
@ -609,8 +587,8 @@ void goto_set_num_threads(int num_threads)
|
|||
blas_server_avail = 1;
|
||||
}
|
||||
|
||||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||
//printf("set_num_threads: creating thread [%d]\n", i);
|
||||
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
|
||||
//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
|
||||
|
||||
blas_threads[i] = CreateThread(NULL, 0,
|
||||
blas_thread_server, (void *)i,
|
||||
|
@ -625,6 +603,9 @@ void goto_set_num_threads(int num_threads)
|
|||
blas_cpu_number = num_threads;
|
||||
}
|
||||
|
||||
//
|
||||
// Openblas function to set thread count
|
||||
//
|
||||
void openblas_set_num_threads(int num)
|
||||
{
|
||||
goto_set_num_threads(num);
|
||||
|
|
|
@ -275,6 +275,7 @@ extern gotoblas_t gotoblas_EXCAVATOR;
|
|||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* Copyright 2023-2024 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -143,12 +143,13 @@ extern gotoblas_t gotoblas_ARMV8SVE;
|
|||
#endif
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
#endif
|
||||
#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
#define NUM_CORETYPES 16
|
||||
#define NUM_CORETYPES 17
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
@ -178,6 +179,7 @@ static char *corename[] = {
|
|||
"emag8180",
|
||||
"neoversen1",
|
||||
"neoversev1",
|
||||
"neoversev2",
|
||||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
|
@ -198,10 +200,11 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
|
||||
if (gotoblas == &gotoblas_NEOVERSEV2) return corename[12];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[13];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
|
||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_NEOVERSEV1);
|
||||
case 12: return (&gotoblas_NEOVERSEN2);
|
||||
case 13: return (&gotoblas_THUNDERX3T110);
|
||||
case 14: return (&gotoblas_CORTEXA55);
|
||||
case 15: return (&gotoblas_ARMV8SVE);
|
||||
case 12: return (&gotoblas_NEOVERSEV2);
|
||||
case 13: return (&gotoblas_NEOVERSEN2);
|
||||
case 14: return (&gotoblas_THUNDERX3T110);
|
||||
case 15: return (&gotoblas_CORTEXA55);
|
||||
case 16: return (&gotoblas_ARMV8SVE);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
@ -312,6 +316,13 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_NEOVERSEN1;
|
||||
}else
|
||||
return &gotoblas_NEOVERSEV1;
|
||||
case 0xd4f:
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
} else {
|
||||
return &gotoblas_NEOVERSEV2;
|
||||
}
|
||||
#endif
|
||||
case 0xd05: // Cortex A55
|
||||
return &gotoblas_CORTEXA55;
|
||||
|
|
|
@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
|
|||
#define CPU_POWER9 9
|
||||
#define CPU_POWER10 10
|
||||
|
||||
#ifndef POWER_9
|
||||
#define POWER_9 0x20000 /* 9 class CPU */
|
||||
#endif
|
||||
#ifndef POWER_10
|
||||
#define POWER_10 0x40000 /* 10 class CPU */
|
||||
#endif
|
||||
|
||||
#ifdef _AIX
|
||||
#include <sys/systemcfg.h>
|
||||
|
||||
|
@ -62,7 +69,7 @@ static int cpuid(void)
|
|||
else if (arch == POWER_9) return CPU_POWER9;
|
||||
#endif
|
||||
#ifdef POWER_10
|
||||
else if (arch == POWER_10) return CPU_POWER10;
|
||||
else if (arch >= POWER_10) return CPU_POWER10;
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) {
|
|||
if (gotoblas && gotoblas -> init) {
|
||||
strncpy(coren,gotoblas_corename(),20);
|
||||
sprintf(coremsg, "Core: %s\n",coren);
|
||||
if (getenv("GET_OPENBLAS_CORETYPE")) {
|
||||
fprintf(stderr, "%s", coremsg);
|
||||
}
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
|
|
|
@ -3214,7 +3214,7 @@ void blas_shutdown(void){
|
|||
#endif
|
||||
memory[pos].lock = 0;
|
||||
}
|
||||
if (memory_overflowed)
|
||||
if (memory_overflowed) {
|
||||
for (pos = 0; pos < NEW_BUFFERS; pos ++){
|
||||
newmemory[pos].addr = (void *)0;
|
||||
newmemory[pos].used = 0;
|
||||
|
@ -3222,6 +3222,10 @@ void blas_shutdown(void){
|
|||
newmemory[pos].pos = -1;
|
||||
#endif
|
||||
newmemory[pos].lock = 0;
|
||||
}
|
||||
free(newmemory);
|
||||
newmemory = NULL;
|
||||
memory_overflowed = 0;
|
||||
}
|
||||
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
|
|
@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef SMP_SERVER
|
||||
|
||||
extern void openblas_set_num_threads(int num_threads) ;
|
||||
extern int openblas_get_num_threads(void) ;
|
||||
|
||||
void openblas_set_num_threads_(int* num_threads){
|
||||
openblas_set_num_threads(*num_threads);
|
||||
}
|
||||
|
||||
int openblas_set_num_threads_local(int num_threads){
|
||||
int ret = openblas_get_num_threads();
|
||||
openblas_set_num_threads(num_threads);
|
||||
blas_omp_threads_local=num_threads;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
#else
|
||||
//Single thread
|
||||
|
||||
|
@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
|
|||
void openblas_set_num_threads_(int* num_threads){
|
||||
|
||||
}
|
||||
|
||||
int openblas_set_num_threads_local(int num_threads){
|
||||
return 1;
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -73,6 +73,10 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
|
||||
EXTRALIB += -lxlf90
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
EXTRALIB += -pgf90libs
|
||||
endif
|
||||
|
@ -132,8 +136,12 @@ libgoto_hpl.def : $(GENSYM)
|
|||
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
ifeq ($(FIXED_LIBNAME),1)
|
||||
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib
|
||||
else
|
||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
|
@ -169,8 +177,12 @@ INTERNALNAME = $(LIBPREFIX).so
|
|||
FEXTRALIB += -lm
|
||||
EXTRALIB += -lm
|
||||
else
|
||||
ifeq ($(FIXED_LIBNAME),1)
|
||||
INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so
|
||||
else
|
||||
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
|
||||
|
@ -248,6 +260,20 @@ endif
|
|||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
|
||||
so : ../$(LIBSONAME) linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
# AIX shared library: link the static archive into ../$(LIBSONAME), exporting
# exactly the symbols listed in aix.exp.  The archive is named as a
# prerequisite because the recipe links it; otherwise a rebuilt archive would
# leave a stale shared object that still looks up to date.
../$(LIBSONAME) : ../$(LIBNAME) aix.exp
	$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
	-Wl,-bcdtors:all:-2147481648:s,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)

# Export list for the AIX linker, derived from the archive's symbol table:
# keep symbols whose nm type is T, D, B, W, V or Z and whose name does not
# start with '.', tag the W/V/Z ones with " weak", then sort and de-duplicate.
# Depends on the archive because it is the input that nm reads.
aix.exp : ../$(LIBNAME)
	/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
	|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
	{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
	/usr/bin/sort -u > $@
|
||||
|
||||
ifeq ($(COMPILER_F77), xlf)
|
||||
|
||||
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
|
||||
|
@ -289,6 +315,11 @@ test : linktest.c
|
|||
|
||||
linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
|
||||
./$(GENSYM) linktest $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
mv linktest.c linktest.c.FIRST
|
||||
egrep -v 'second_|dsecnd_' linktest.c.FIRST > linktest.c
|
||||
rm linktest.c.FIRST
|
||||
endif
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
|
|
@ -60,6 +60,7 @@ cblasobjsc="
|
|||
cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
|
||||
cblas_scnrm2 cblas_scasum cblas_cgemmt
|
||||
cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
|
||||
cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
|
||||
"
|
||||
cblasobjsd="
|
||||
cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
|
||||
|
@ -69,6 +70,7 @@ cblasobjsd="
|
|||
cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
|
||||
cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
|
||||
cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
|
||||
cblas_damax cblas_damin
|
||||
"
|
||||
|
||||
cblasobjss="
|
||||
|
@ -80,6 +82,7 @@ cblasobjss="
|
|||
cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
|
||||
cblas_strsv cblas_sgeadd cblas_sgemmt
|
||||
cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
|
||||
cblas_samax cblas_samin
|
||||
"
|
||||
|
||||
cblasobjsz="
|
||||
|
@ -91,6 +94,7 @@ cblasobjsz="
|
|||
cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
|
||||
cblas_zaxpby cblas_zgeadd cblas_zgemmt
|
||||
cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
|
||||
cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
|
||||
"
|
||||
|
||||
cblasobjs="cblas_xerbla"
|
||||
|
@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z
|
|||
zgedmd
|
||||
zgedmdq
|
||||
"
|
||||
|
||||
#functions added post 3.11
|
||||
|
||||
lapackobjs2c="$lapackobjs2c
|
||||
claqp2rk
|
||||
claqp3rk
|
||||
ctrsyl3
|
||||
"
|
||||
# claqz0
|
||||
# claqz1
|
||||
# claqz2
|
||||
# claqz3
|
||||
# clatrs3
|
||||
|
||||
lapackobjs2d="$lapackobjs2d
|
||||
dgelqs
|
||||
dgelst
|
||||
dgeqp3rk
|
||||
dgeqrs
|
||||
dlaqp2rk
|
||||
dlaqp3rk
|
||||
dlarmm
|
||||
dlatrs3
|
||||
dtrsyl3
|
||||
"
|
||||
# dlaqz0
|
||||
# dlaqz1
|
||||
# dlaqz2
|
||||
# dlaqz3
|
||||
# dlaqz4
|
||||
|
||||
lapackobjs2z="$lapackobjs2z
|
||||
zgelqs
|
||||
zgelst
|
||||
zgeqp3rk
|
||||
zgeqrs
|
||||
zlaqp2rk
|
||||
zlaqp3rk
|
||||
zlatrs3
|
||||
zrscl
|
||||
ztrsyl3
|
||||
"
|
||||
# zlaqz0
|
||||
# zlaqz1
|
||||
# zlaqz2
|
||||
# zlaqz3
|
||||
|
||||
lapack_extendedprecision_objs="
|
||||
zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
|
||||
dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
|
||||
|
@ -1622,6 +1673,14 @@ lapackeobjsc="
|
|||
LAPACKE_cgetsqrhrt_work
|
||||
LAPACKE_cungtsqr_row
|
||||
LAPACKE_cungtsqr_row_work
|
||||
LAPACKE_clangb
|
||||
LAPACKE_clangb_work
|
||||
LAPACKE_ctrsyl3
|
||||
LAPACKE_ctrsyl3_work
|
||||
LAPACKE_ctz_nancheck
|
||||
LAPACKE_ctz_trans
|
||||
LAPACKE_cunhr_col
|
||||
LAPACKE_cunhr_col_work
|
||||
"
|
||||
|
||||
lapackeobjsd="
|
||||
|
@ -2239,6 +2298,14 @@ lapackeobjsd="
|
|||
LAPACKE_dgetsqrhrt_work
|
||||
LAPACKE_dorgtsqr_row
|
||||
LAPACKE_dorgtsqr_row_work
|
||||
LAPACKE_dlangb
|
||||
LAPACKE_dlangb_work
|
||||
LAPACKE_dorhr_col
|
||||
LAPACKE_dorhr_col_work
|
||||
LAPACKE_dtrsyl3
|
||||
LAPACKE_dtrsyl3_work
|
||||
LAPACKE_dtz_nancheck
|
||||
LAPACKE_dtz_trans
|
||||
"
|
||||
|
||||
lapackeobjss="
|
||||
|
@ -2848,6 +2915,14 @@ lapackeobjss="
|
|||
LAPACKE_sgetsqrhrt_work
|
||||
LAPACKE_sorgtsqr_row
|
||||
LAPACKE_sorgtsqr_row_work
|
||||
LAPACKE_slangb
|
||||
LAPACKE_slangb_work
|
||||
LAPACKE_sorhr_col
|
||||
LAPACKE_sorhr_col_work
|
||||
LAPACKE_strsyl3
|
||||
LAPACKE_strsyl3_work
|
||||
LAPACKE_stz_nancheck
|
||||
LAPACKE_stz_trans
|
||||
"
|
||||
|
||||
lapackeobjsz="
|
||||
|
@ -3515,6 +3590,14 @@ lapackeobjsz="
|
|||
LAPACKE_zgetsqrhrt_work
|
||||
LAPACKE_zungtsqr_row
|
||||
LAPACKE_zungtsqr_row_work
|
||||
LAPACKE_zlangb
|
||||
LAPACKE_zlangb_work
|
||||
LAPACKE_ztrsyl3
|
||||
LAPACKE_ztrsyl3_work
|
||||
LAPACKE_ztz_nancheck
|
||||
LAPACKE_ztz_trans
|
||||
LAPACKE_zunhr_col
|
||||
LAPACKE_zunhr_col_work
|
||||
"
|
||||
## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
|
||||
## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
|
||||
|
@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s="
|
|||
ssysv_aa_2stage ssytrf_aa_2stage
|
||||
ssytrs_aa_2stage
|
||||
slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
|
||||
slarfb_gett
|
||||
"
|
||||
lapack_embeded_underscore_objs_c="
|
||||
chetf2_rook chetrf_rook chetri_rook
|
||||
|
@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c="
|
|||
csysv_aa_2stage csytrf_aa_2stage
|
||||
csytrs_aa_2stage
|
||||
claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
|
||||
clarfb_gett
|
||||
"
|
||||
lapack_embeded_underscore_objs_d="
|
||||
dlasyf_rook
|
||||
|
@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d="
|
|||
dsysv_aa_2stage
|
||||
dsytrf_aa_2stage dsytrs_aa_2stage
|
||||
dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
|
||||
dlarfb_gett
|
||||
"
|
||||
lapack_embeded_underscore_objs_z="
|
||||
zhetf2_rook zhetrf_rook zhetri_rook
|
||||
|
@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z="
|
|||
zhetrs_aa_2stage zsysv_aa_2stage
|
||||
zsytrf_aa_2stage zsytrs_aa_2stage
|
||||
zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
|
||||
zlarfb_gett
|
||||
"
|
||||
|
||||
dirname=`pwd -P`/../lapack-netlib
|
||||
|
|
10
f_check
10
f_check
|
@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
|
|||
pathf90 pathf95
|
||||
pgf95 pgf90 pgf77 pgfortran nvfortran
|
||||
flang egfortran
|
||||
ifort nagfor ifx ftn crayftn"
|
||||
ifort nagfor ifx ftn crayftn armflang"
|
||||
|
||||
for list in $lists; do
|
||||
for p in $path; do
|
||||
|
@ -85,7 +85,11 @@ else
|
|||
*Hewlett*)
|
||||
vendor=CRAY
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
;;
|
||||
*Arm\ F90*)
|
||||
vendor=FLANG
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*GNU*|*GCC*)
|
||||
|
||||
v="${data#*GCC: *\) }"
|
||||
|
@ -108,7 +112,7 @@ else
|
|||
if [ "$major" -ge 17 ]; then
|
||||
vendor=FLANGNEW
|
||||
fi
|
||||
;;
|
||||
;;
|
||||
*ifort*|*ifx*)
|
||||
vendor=INTEL
|
||||
openmp='-fopenmp'
|
||||
|
|
96
getarch.c
96
getarch.c
|
@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <sys/sysinfo.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#if defined(AIX)
|
||||
#if defined(_AIX)
|
||||
#include <unistd.h>
|
||||
#include <sys/systemcfg.h>
|
||||
#include <sys/sysinfo.h>
|
||||
#endif
|
||||
|
||||
|
@ -150,6 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* #define FORCE_EV4 */
|
||||
/* #define FORCE_EV5 */
|
||||
/* #define FORCE_EV6 */
|
||||
/* #define FORCE_CSKY */
|
||||
/* #define FORCE_CK860FV */
|
||||
/* #define FORCE_GENERIC */
|
||||
|
||||
#ifdef FORCE_P2
|
||||
|
@ -1327,6 +1331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "CORTEXA73"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA76
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA76"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA76 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa76"
|
||||
#define CORENAME "CORTEXA76"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXX1
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
|
@ -1677,9 +1696,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define LIBNAME "c910v"
|
||||
#define CORENAME "C910V"
|
||||
#endif
|
||||
#endif
|
||||
#ifdef FORCE_x280
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "RISCV64"
|
||||
#define SUBARCHITECTURE "x280"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-Dx280 " \
|
||||
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "x280"
|
||||
#define CORENAME "x280"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_RISCV64_ZVL256B
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "RISCV64"
|
||||
#define SUBARCHITECTURE "RISCV64_ZVL256B"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-DRISCV64_ZVL256B " \
|
||||
"-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "riscv64_zvl256b"
|
||||
#define CORENAME "RISCV64_ZVL256B"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_RISCV64_ZVL128B
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "RISCV64"
|
||||
#define SUBARCHITECTURE "RISCV64_ZVL128B"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-DRISCV64_ZVL128B " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "riscv64_zvl128b"
|
||||
#define CORENAME "RISCV64_ZVL128B"
|
||||
#endif
|
||||
|
||||
#if defined(FORCE_E2K) || defined(__e2k__)
|
||||
#define FORCE
|
||||
|
@ -1692,6 +1748,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CORENAME "generic"
|
||||
#endif
|
||||
|
||||
/*
 * Forced target: generic C-SKY core, selected by defining FORCE_CSKY.
 *
 * ARCHCONFIG is one string of blank-separated -D options assembled by
 * adjacent string-literal concatenation, so every fragment that is followed
 * by another one must end in a blank.  Without the blank after "-DCSKY" the
 * first two options fuse into "-DCSKY-DL1_DATA_SIZE=65536" and neither CSKY
 * nor L1_DATA_SIZE is passed on as intended.
 */
#ifdef FORCE_CSKY
#define FORCE
#define ARCHITECTURE "CSKY"
#define SUBARCHITECTURE "CSKY"
#define SUBDIRNAME "csky"
#define ARCHCONFIG "-DCSKY " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "csky"
#define CORENAME "CSKY"
#endif
|
||||
|
||||
#ifdef FORCE_CK860FV
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "CSKY"
|
||||
#define SUBARCHITECTURE "CK860V"
|
||||
#define SUBDIRNAME "csky"
|
||||
#define ARCHCONFIG "-DCK860FV " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "ck860fv"
|
||||
#define CORENAME "CK860FV"
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
#ifdef USER_TARGET
|
||||
|
@ -1766,7 +1849,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef OPENBLAS_SUPPORTED
|
||||
#error "This arch/CPU is not supported by OpenBLAS."
|
||||
#endif
|
||||
|
@ -1805,11 +1887,13 @@ static int get_num_cores(void) {
|
|||
|
||||
return count;
|
||||
|
||||
#elif defined(AIX)
|
||||
#elif defined(_AIX)
|
||||
//returns the number of processors which are currently online
|
||||
count = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
if (count <= 0) count = 2;
|
||||
|
||||
|
||||
return count;
|
||||
|
||||
#else
|
||||
return 2;
|
||||
#endif
|
||||
|
@ -1831,7 +1915,7 @@ int main(int argc, char *argv[]){
|
|||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
@ -1979,7 +2063,7 @@ printf("ELF_VERSION=2\n");
|
|||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -119,6 +119,7 @@ endif ()
|
|||
if (BUILD_BFLOAT16)
|
||||
GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
|
||||
|
@ -130,6 +131,8 @@ endif ()
|
|||
foreach (float_type ${FLOAT_TYPES})
|
||||
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
|
||||
GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
|
||||
|
|
|
@ -270,7 +270,8 @@ CSBLAS1OBJS = \
|
|||
cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
|
||||
cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
|
||||
cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
|
||||
cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
|
||||
cblas_samin.$(SUFFIX)
|
||||
|
||||
CSBLAS2OBJS = \
|
||||
cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
|
||||
|
@ -295,7 +296,8 @@ CDBLAS1OBJS = \
|
|||
cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
|
||||
cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
|
||||
cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
|
||||
cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
|
||||
cblas_damin.$(SUFFIX)
|
||||
|
||||
CDBLAS2OBJS = \
|
||||
cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
|
||||
|
@ -315,7 +317,7 @@ CCBLAS1OBJS = \
|
|||
cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
|
||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
|
@ -340,12 +342,12 @@ CXERBLAOBJ = \
|
|||
|
||||
CZBLAS1OBJS = \
|
||||
cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX) cblas_zaxpy.$(SUFFIX) \
|
||||
cblas_zcopy.$(SUFFIX) \
|
||||
cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
|
||||
cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
|
||||
cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
|
||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
|
||||
|
||||
|
||||
|
@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
|||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
|
@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
|
|||
cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
|
||||
|
||||
cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
|
|||
cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||
|
||||
cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||
|
||||
cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
|
||||
|
||||
cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
|||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
|
||||
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
|
|
|
@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
if (n < 0) info = 1;
|
||||
if (m < 0) info = 2;
|
||||
}
|
||||
|
||||
if (info >= 0) {
|
||||
|
|
|
@ -533,8 +533,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
else {
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
if (MNK/args.nthreads < SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD)
|
||||
args.nthreads = MNK/(SMP_THRESHOLD_MIN*(double)GEMM_MULTITHREAD_THRESHOLD);
|
||||
}
|
||||
|
||||
args.common = NULL;
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
|
|
@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
|||
|
||||
char transA, transB, Uplo;
|
||||
blasint nrowa, nrowb;
|
||||
#if defined(COMPLEX)
|
||||
blasint ncolb;
|
||||
#endif
|
||||
IFLOAT *buffer;
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
|
@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
|||
uplo = 0;
|
||||
if (Uplo == 'L')
|
||||
uplo = 1;
|
||||
|
||||
|
||||
nrowa = m;
|
||||
if (transa) nrowa = k;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb) nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = m;
|
||||
#endif
|
||||
if (transb & 1) {
|
||||
nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = k;
|
||||
#endif
|
||||
}
|
||||
|
||||
info = 0;
|
||||
|
||||
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowa))
|
||||
if (ldb < MAX(1, nrowb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowb))
|
||||
if (lda < MAX(1, nrowa))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
|
@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
blasint info;
|
||||
blasint lda, ldb;
|
||||
FLOAT *a, *b;
|
||||
#if defined(COMPLEX)
|
||||
blasint nrowb, ncolb;
|
||||
#endif
|
||||
XFLOAT *buffer;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
info = -1;
|
||||
|
||||
blasint nrowa, nrowb;
|
||||
blasint nrowa;
|
||||
#if !defined(COMPLEX)
|
||||
blasint nrowb;
|
||||
#endif
|
||||
nrowa = m;
|
||||
if (transa) nrowa = k;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb) nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = m;
|
||||
#endif
|
||||
if (transb & 1) {
|
||||
nrowb = m;
|
||||
#if defined(COMPLEX)
|
||||
ncolb = k;
|
||||
#endif
|
||||
}
|
||||
|
||||
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
|
@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
info = -1;
|
||||
|
||||
blasint ncola, ncolb;
|
||||
ncola = k;
|
||||
if (transa) ncola = m;
|
||||
ncolb = m;
|
||||
if (transb) ncolb = k;
|
||||
blasint ncola;
|
||||
#if !defined(COMPLEX)
|
||||
blasint ncolb;
|
||||
#endif
|
||||
ncola = m;
|
||||
if (transa & 1) ncola = k;
|
||||
ncolb = k;
|
||||
#if defined(COMPLEX)
|
||||
nrowb = m;
|
||||
#endif
|
||||
|
||||
if (transb & 1) {
|
||||
#if defined(COMPLEX)
|
||||
nrowb = k;
|
||||
#endif
|
||||
ncolb = m;
|
||||
}
|
||||
|
||||
if (ldc < MAX(1,m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, ncolb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, ncola))
|
||||
info = 8;
|
||||
if (lda < MAX(1, ncola))
|
||||
info = 10;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (transa < 0)
|
||||
info = 3;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
IDEBUG_START;
|
||||
|
||||
const blasint incb = (transb == 0) ? 1 : ldb;
|
||||
#if defined(COMPLEX)
|
||||
if (transb > 1){
|
||||
#ifndef CBLAS
|
||||
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||
#else
|
||||
if (order == CblasColMajor)
|
||||
IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||
if (order == CblasRowMajor)
|
||||
IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
|
||||
|
||||
if (uplo == 1) {
|
||||
for (i = 0; i < m; i++) {
|
||||
|
@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#if defined(COMPLEX)
|
||||
aa = a + i * 2;
|
||||
bb = b + i * ldb * 2;
|
||||
if (transa) {
|
||||
if (transa & 1) {
|
||||
aa = a + lda * i * 2;
|
||||
}
|
||||
if (transb)
|
||||
if (transb & 1)
|
||||
bb = b + i * 2;
|
||||
cc = c + i * 2 * ldc + i * 2;
|
||||
#else
|
||||
aa = a + i;
|
||||
bb = b + i * ldb;
|
||||
if (transa) {
|
||||
if (transa & 1) {
|
||||
aa = a + lda * i;
|
||||
}
|
||||
if (transb)
|
||||
if (transb & 1)
|
||||
bb = b + i;
|
||||
cc = c + i * ldc + i;
|
||||
#endif
|
||||
|
@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||
return;
|
||||
continue;
|
||||
#else
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
@ -472,13 +522,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
IDEBUG_START;
|
||||
|
||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
|
@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
aa, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
|
@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
aa, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
|
||||
bb, incb, cc, 1, buffer);
|
||||
else
|
||||
|
@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||
lda, bb, incb, cc,
|
||||
1, buffer,
|
||||
|
@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
l = j;
|
||||
#if defined COMPLEX
|
||||
bb = b + i * ldb * 2;
|
||||
if (transb) {
|
||||
if (transb & 1) {
|
||||
bb = b + i * 2;
|
||||
}
|
||||
cc = c + i * 2 * ldc;
|
||||
#else
|
||||
bb = b + i * ldb;
|
||||
if (transb) {
|
||||
if (transb & 1) {
|
||||
bb = b + i;
|
||||
}
|
||||
cc = c + i * ldc;
|
||||
|
@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
NULL, 0);
|
||||
|
||||
if (alpha_r == ZERO && alpha_i == ZERO)
|
||||
return;
|
||||
continue;
|
||||
#else
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
@ -561,13 +611,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, FLOAT, buffer);
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
|
@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
#endif
|
||||
|
||||
#if defined(COMPLEX)
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
|
||||
a, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
|
@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
a, lda, bb, incb, cc, 1,
|
||||
buffer);
|
||||
#else
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
|
||||
incb, cc, 1, buffer);
|
||||
else
|
||||
|
@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
|||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!transa)
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||
bb, incb, cc, 1,
|
||||
buffer, nthreads);
|
||||
|
|
|
@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
#ifdef SMP
|
||||
|
||||
if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
|
|
@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
}
|
||||
#endif
|
||||
|
||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
|
||||
if ( *rows > *cols )
|
||||
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT);
|
||||
else
|
||||
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT);
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
|
|
@ -95,14 +95,19 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
|||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
|
||||
#ifndef DOUBLE
|
||||
if (args.m*args.n < 40000)
|
||||
#else
|
||||
if (args.m*args.n < 10000)
|
||||
int nmax = 40000;
|
||||
#else
|
||||
int nmax = 10000;
|
||||
#endif
|
||||
args.nthreads=1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
if (args.m*args.n <nmax) {
|
||||
args.nthreads = 1;
|
||||
} else {
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
if ((args.m*args.n)/args.nthreads <nmax)
|
||||
args.nthreads = (args.m*args.n)/nmax;
|
||||
}
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
|
|
@ -113,13 +113,17 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
#ifndef DOUBLE
|
||||
if (args.n <128)
|
||||
#else
|
||||
if (args.n <64)
|
||||
int nmax = 128;
|
||||
#else
|
||||
int nmax = 64;
|
||||
#endif
|
||||
if (args.n <nmax) {
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
} else {
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
if (args.n/args.nthreads <nmax)
|
||||
args.nthreads = args.n/nmax;
|
||||
}
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
|
|
@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
|
|||
if (trans_arg == 'R') trans = 0;
|
||||
if (trans_arg == 'C') trans = 1;
|
||||
|
||||
TOUPPER(uplo_arg);
|
||||
uplo = -1;
|
||||
if (uplo_arg == 'U') uplo = 0;
|
||||
if (uplo_arg == 'L') uplo = 1;
|
||||
|
||||
TOUPPER(diag_arg);
|
||||
diag = -1;
|
||||
if (diag_arg == 'U') diag = 0;
|
||||
if (diag_arg == 'N') diag = 1;
|
||||
|
|
|
@ -95,10 +95,12 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
|
|||
if (trans_arg == 'R') trans = 2;
|
||||
if (trans_arg == 'C') trans = 3;
|
||||
|
||||
TOUPPER(uplo_arg);
|
||||
uplo = -1;
|
||||
if (uplo_arg == 'U') uplo = 0;
|
||||
if (uplo_arg == 'L') uplo = 1;
|
||||
|
||||
TOUPPER(diag_arg);
|
||||
diag = -1;
|
||||
if (diag_arg == 'U') diag = 0;
|
||||
if (diag_arg == 'N') diag = 1;
|
||||
|
|
|
@ -46,6 +46,12 @@
|
|||
|
||||
#ifdef USE_ABS
|
||||
|
||||
#if defined(DOUBLE)
|
||||
#define ABS fabs
|
||||
#else
|
||||
#define ABS fabsf
|
||||
#endif
|
||||
|
||||
#ifndef USE_MIN
|
||||
|
||||
/* ABS & MAX */
|
||||
|
@ -92,6 +98,8 @@
|
|||
|
||||
#else
|
||||
|
||||
#define ABS
|
||||
|
||||
#ifndef USE_MIN
|
||||
|
||||
/* MAX */
|
||||
|
@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
|||
|
||||
if (n <= 0) return 0;
|
||||
|
||||
#ifndef COMPLEX
|
||||
if (incx == 0) return (ABS(*x));
|
||||
#else
|
||||
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
|
||||
#endif
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
@ -145,14 +159,25 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
|||
|
||||
#else
|
||||
|
||||
#ifdef COMPLEX
|
||||
FLOAT CNAME(blasint n, void *vx, blasint incx){
|
||||
FLOAT *x = (FLOAT*) vx;
|
||||
#else
|
||||
FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
||||
|
||||
#endif
|
||||
|
||||
FLOAT ret;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
if (n <= 0) return 0;
|
||||
|
||||
#ifndef COMPLEX
|
||||
if (incx == 0) return (ABS(*x));
|
||||
#else
|
||||
if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
|
||||
#endif
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
|
|
@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
|||
else
|
||||
{
|
||||
dp2 = *dd2 * dy1;
|
||||
if(dp2 == ZERO)
|
||||
{
|
||||
dflag = -TWO;
|
||||
dparam[0] = dflag;
|
||||
return;
|
||||
}
|
||||
dp1 = *dd1 * *dx1;
|
||||
dq2 = dp2 * dy1;
|
||||
dq1 = dp1 * *dx1;
|
||||
|
@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
|||
dh12 = dp2 / dp1;
|
||||
|
||||
du = ONE - dh12 * dh21;
|
||||
if(du > ZERO)
|
||||
{
|
||||
dflag = ZERO;
|
||||
*dd1 = *dd1 / du;
|
||||
*dd2 = *dd2 / du;
|
||||
*dx1 = *dx1 * du;
|
||||
} else {
|
||||
dflag = -ONE;
|
||||
|
||||
dh11 = ZERO;
|
||||
dh12 = ZERO;
|
||||
dh21 = ZERO;
|
||||
dh22 = ZERO;
|
||||
|
||||
*dd1 = ZERO;
|
||||
*dd2 = ZERO;
|
||||
*dx1 = ZERO;
|
||||
}
|
||||
dflag = ZERO;
|
||||
*dd1 = *dd1 / du;
|
||||
*dd2 = *dd2 / du;
|
||||
*dx1 = *dx1 * du;
|
||||
|
||||
}
|
||||
else
|
||||
|
|
|
@ -0,0 +1,447 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2024, The OpenBLAS Project. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "common.h"
|
||||
|
||||
#define SMP_THRESHOLD_MIN 65536.0
|
||||
#define ERROR_NAME "SBGEMMT "
|
||||
|
||||
#ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||
#define GEMM_MULTITHREAD_THRESHOLD 4
|
||||
#endif
|
||||
|
||||
/*
 * SBGEMMT driver. Built either as the Fortran-style entry point NAME (every
 * argument passed by pointer) or, when CBLAS is defined, as the C entry point
 * CNAME (scalars by value plus an explicit storage order).
 *
 * Both variants validate their arguments and report problems through xerbla.
 * They then fall into the shared body after the #endif, which updates one
 * triangle of c with m calls to the SBGEMV kernels, one call per index i.
 * a and b are IFLOAT inputs; alpha, beta and c are FLOAT.
 */
#ifndef CBLAS
|
||||
|
||||
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
|
||||
blasint * M, blasint * K,
|
||||
FLOAT * Alpha,
|
||||
IFLOAT * a, blasint * ldA,
|
||||
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
|
||||
{
|
||||
|
||||
blasint m, k;
|
||||
blasint lda, ldb, ldc;
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
|
||||
char transA, transB, Uplo;
|
||||
blasint nrowa, nrowb;
|
||||
IFLOAT *buffer;
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
FLOAT alpha, beta;
|
||||
|
||||
PRINT_DEBUG_NAME;
|
||||
|
||||
/* Copy the by-reference arguments into locals. */
m = *M;
|
||||
k = *K;
|
||||
|
||||
alpha = *Alpha;
|
||||
beta = *Beta;
|
||||
|
||||
lda = *ldA;
|
||||
ldb = *ldB;
|
||||
ldc = *ldC;
|
||||
|
||||
transA = *TRANSA;
|
||||
transB = *TRANSB;
|
||||
Uplo = *UPLO;
|
||||
/* NOTE(review): TOUPPER presumably upper-cases in place so that lower-case
   flag characters are accepted too; confirm against its definition. */
TOUPPER(transA);
|
||||
TOUPPER(transB);
|
||||
TOUPPER(Uplo);
|
||||
|
||||
/* -1 means "flag not recognised"; it is turned into info 1..3 below. */
transa = -1;
|
||||
transb = -1;
|
||||
uplo = -1;
|
||||
|
||||
if (transA == 'N')
|
||||
transa = 0;
|
||||
if (transA == 'T')
|
||||
transa = 1;
|
||||
|
||||
/* 'R' and 'C' are mapped to the same codes as 'N' and 'T'. */
if (transA == 'R')
|
||||
transa = 0;
|
||||
if (transA == 'C')
|
||||
transa = 1;
|
||||
|
||||
if (transB == 'N')
|
||||
transb = 0;
|
||||
if (transB == 'T')
|
||||
transb = 1;
|
||||
|
||||
if (transB == 'R')
|
||||
transb = 0;
|
||||
if (transB == 'C')
|
||||
transb = 1;
|
||||
|
||||
if (Uplo == 'U')
|
||||
uplo = 0;
|
||||
if (Uplo == 'L')
|
||||
uplo = 1;
|
||||
/* Smallest acceptable leading dimensions for a and b, which depend on
   whether the corresponding operand is flagged as transposed. */
nrowa = m;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb & 1) nrowb = m;
|
||||
|
||||
info = 0;
|
||||
|
||||
/* The checks run from the last argument to the first, and each one
   overwrites info, so when several arguments are invalid the lowest
   argument number is the one reported. */
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowa))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
|
||||
/* Any non-zero info is reported through xerbla and no work is done. */
if (info != 0) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
#else
|
||||
|
||||
/* C entry point: same operation, with scalars passed by value and an explicit
   storage order. A/B/LDA/LDB are mapped onto a/b/lda/ldb below. */
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
|
||||
blasint k,
|
||||
FLOAT alpha,
|
||||
IFLOAT * A, blasint LDA,
|
||||
IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
|
||||
{
|
||||
IFLOAT *aa, *bb;
|
||||
FLOAT *cc;
|
||||
|
||||
int transa, transb, uplo;
|
||||
blasint info;
|
||||
blasint lda, ldb;
|
||||
IFLOAT *a, *b;
|
||||
XFLOAT *buffer;
|
||||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
uplo = -1;
|
||||
transa = -1;
|
||||
transb = -1;
|
||||
/* If order matches neither branch below, info keeps this 0 and the
   "info >= 0" test further down reports it through xerbla. */
info = 0;
|
||||
|
||||
if (order == CblasColMajor) {
|
||||
if (Uplo == CblasUpper) uplo = 0;
|
||||
if (Uplo == CblasLower) uplo = 1;
|
||||
|
||||
if (TransA == CblasNoTrans)
|
||||
transa = 0;
|
||||
if (TransA == CblasTrans)
|
||||
transa = 1;
|
||||
|
||||
/* The Conj variants get the same codes as their non-conjugated forms. */
if (TransA == CblasConjNoTrans)
|
||||
transa = 0;
|
||||
if (TransA == CblasConjTrans)
|
||||
transa = 1;
|
||||
|
||||
if (TransB == CblasNoTrans)
|
||||
transb = 0;
|
||||
if (TransB == CblasTrans)
|
||||
transb = 1;
|
||||
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transb = 0;
|
||||
if (TransB == CblasConjTrans)
|
||||
transb = 1;
|
||||
|
||||
a = (void *)A;
|
||||
b = (void *)B;
|
||||
lda = LDA;
|
||||
ldb = LDB;
|
||||
|
||||
/* From here on, -1 means "no error found". */
info = -1;
|
||||
|
||||
blasint nrowa;
|
||||
blasint nrowb;
|
||||
nrowa = m;
|
||||
if (transa & 1) nrowa = k;
|
||||
nrowb = k;
|
||||
if (transb & 1) nrowb = m;
|
||||
|
||||
/* Last-to-first checks: the lowest failing argument number wins. */
if (ldc < MAX(1, m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, nrowb))
|
||||
info = 10;
|
||||
if (lda < MAX(1, nrowa))
|
||||
info = 8;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 3;
|
||||
if (transa < 0)
|
||||
info = 2;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
||||
if (order == CblasRowMajor) {
|
||||
|
||||
/* Row-major: the roles of A and B are exchanged. The pointers, leading
   dimensions and (below) transpose flags all swap. */
a = (void *)B;
|
||||
b = (void *)A;
|
||||
|
||||
lda = LDB;
|
||||
ldb = LDA;
|
||||
|
||||
if (Uplo == CblasUpper) uplo = 0;
|
||||
if (Uplo == CblasLower) uplo = 1;
|
||||
|
||||
/* transa is derived from TransB here, and transb from TransA. */
if (TransB == CblasNoTrans)
|
||||
transa = 0;
|
||||
if (TransB == CblasTrans)
|
||||
transa = 1;
|
||||
|
||||
if (TransB == CblasConjNoTrans)
|
||||
transa = 0;
|
||||
if (TransB == CblasConjTrans)
|
||||
transa = 1;
|
||||
|
||||
if (TransA == CblasNoTrans)
|
||||
transb = 0;
|
||||
if (TransA == CblasTrans)
|
||||
transb = 1;
|
||||
|
||||
if (TransA == CblasConjNoTrans)
|
||||
transb = 0;
|
||||
if (TransA == CblasConjTrans)
|
||||
transb = 1;
|
||||
|
||||
info = -1;
|
||||
|
||||
blasint ncola;
|
||||
blasint ncolb;
|
||||
|
||||
ncola = m;
|
||||
if (transa & 1) ncola = k;
|
||||
ncolb = k;
|
||||
|
||||
if (transb & 1) {
|
||||
ncolb = m;
|
||||
}
|
||||
|
||||
/* Because of the swap, local ldb/transb carry the caller's LDA/TransA, so
   their failures report 8 and 2, while local lda/transa (the caller's
   LDB/TransB) report 10 and 3. */
if (ldc < MAX(1,m))
|
||||
info = 13;
|
||||
if (ldb < MAX(1, ncolb))
|
||||
info = 8;
|
||||
if (lda < MAX(1, ncola))
|
||||
info = 10;
|
||||
if (k < 0)
|
||||
info = 5;
|
||||
if (m < 0)
|
||||
info = 4;
|
||||
if (transb < 0)
|
||||
info = 2;
|
||||
if (transa < 0)
|
||||
info = 3;
|
||||
if (uplo < 0)
|
||||
info = 1;
|
||||
}
|
||||
|
||||
if (info >= 0) {
|
||||
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
return;
|
||||
}
|
||||
|
||||
#endif
|
||||
/* ---- Body shared by both entry points ---- */
int buffer_size;
|
||||
blasint i, j;
|
||||
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
#endif
|
||||
|
||||
|
||||
/* Kernel tables indexed by transa: slot 0 is the non-transposed kernel and
   slot 1 the transposed one. */
#ifdef SMP
|
||||
static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
|
||||
BLASLONG, IFLOAT *, BLASLONG, FLOAT,
|
||||
FLOAT *, BLASLONG, int) = {
|
||||
sbgemv_thread_n, sbgemv_thread_t,
|
||||
};
|
||||
#endif
|
||||
int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
|
||||
IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
|
||||
SBGEMV_N, SBGEMV_T,};
|
||||
|
||||
|
||||
/* Nothing to update when m is zero. */
if (m == 0)
|
||||
return;
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
/* Stride used to walk the b vector handed to the kernel: 1 when b is not
   transposed, ldb when it is. */
const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
|
||||
|
||||
/* uplo == 1 (lower): for each i, update the j = m - i entries of c that start
   at the diagonal element c[i * ldc + i]. */
if (uplo == 1) {
|
||||
for (i = 0; i < m; i++) {
|
||||
j = m - i;
|
||||
|
||||
/* Offset a by i (or by i * lda when transposed) so that it lines up with
   the entries of c being updated; bb selects the i-th vector of b. */
aa = a + i;
|
||||
bb = b + i * ldb;
|
||||
if (transa & 1) {
|
||||
aa = a + lda * i;
|
||||
}
|
||||
if (transb & 1)
|
||||
bb = b + i;
|
||||
cc = c + i * ldc + i;
|
||||
|
||||
/* Disabled code. It refers to `l`, which is not declared in this function;
   beta and alpha are instead passed straight to the kernels below. */
#if 0
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO)
|
||||
continue;
|
||||
#endif
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
/* NOTE(review): buffer is sized, allocated and freed on every iteration, but
   it is not passed to any of the kernel calls in this function. */
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
/* Stay single-threaded when j * k is below the threshold. */
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
/* The kernel gets (j, k) when a is not transposed and (k, j) when it is. */
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, alpha, aa, lda,
|
||||
bb, incb, beta, cc, 1);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, alpha, aa, lda,
|
||||
bb, incb, beta, cc, 1);
|
||||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, aa,
|
||||
lda, bb, incb, beta, cc,
|
||||
1, nthreads);
|
||||
else
|
||||
(gemv_thread[(int)transa]) (k, j, alpha, aa,
|
||||
lda, bb, incb, beta, cc,
|
||||
1, nthreads);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
}
|
||||
} else {
|
||||
|
||||
/* Otherwise (upper): for each i, update the first j = i + 1 entries of c
   starting at c[i * ldc]; a is used from its start, with no offset. */
for (i = 0; i < m; i++) {
|
||||
j = i + 1;
|
||||
|
||||
bb = b + i * ldb;
|
||||
if (transb & 1) {
|
||||
bb = b + i;
|
||||
}
|
||||
cc = c + i * ldc;
|
||||
|
||||
/* Disabled code; as in the lower branch, `l` is not declared here. */
#if 0
|
||||
if (beta != ONE)
|
||||
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO)
|
||||
continue;
|
||||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
/* NOTE(review): as in the lower branch, buffer is allocated and freed but
   never passed to a kernel. */
buffer_size = j + k + 128 / sizeof(FLOAT);
|
||||
#ifdef WINDOWS_ABI
|
||||
buffer_size += 160 / sizeof(FLOAT);
|
||||
#endif
|
||||
// for alignment
|
||||
buffer_size = (buffer_size + 3) & ~3;
|
||||
STACK_ALLOC(buffer_size, IFLOAT, buffer);
|
||||
|
||||
#ifdef SMP
|
||||
|
||||
/* Same single-thread threshold as the lower branch. */
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
|
||||
nthreads = 1;
|
||||
else
|
||||
nthreads = num_cpu_avail(2);
|
||||
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
/* (j, k) for non-transposed a, (k, j) for transposed a. */
if (!(transa & 1))
|
||||
(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
|
||||
incb, beta, cc, 1);
|
||||
else
|
||||
(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
|
||||
incb, beta, cc, 1);
|
||||
|
||||
#ifdef SMP
|
||||
} else {
|
||||
if (!(transa & 1))
|
||||
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
|
||||
bb, incb, beta, cc, 1,
|
||||
nthreads);
|
||||
else
|
||||
(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
|
||||
bb, incb, beta, cc, 1,
|
||||
nthreads);
|
||||
}
|
||||
#endif
|
||||
|
||||
STACK_FREE(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
IDEBUG_END;
|
||||
|
||||
return;
|
||||
}
|
|
@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
|
||||
void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
|
||||
{
|
||||
|
||||
blasint n = *N;
|
||||
blasint incx = *INCX;
|
||||
blasint incy = *INCY;
|
||||
FLOAT* ALPHA = (FLOAT*) VALPHA;
|
||||
FLOAT* BETA = (FLOAT*) VBETA;
|
||||
|
||||
#else
|
||||
|
||||
|
|
|
@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
|||
info = 0;
|
||||
|
||||
|
||||
if (lda < MAX(1, m)) info = 6;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
|
||||
if (n < 0) info = 2;
|
||||
|
@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order,
|
|||
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (n < 0) info = 2;
|
||||
if (m < 0) info = 1;
|
||||
if (n < 0) info = 1;
|
||||
if (m < 0) info = 2;
|
||||
}
|
||||
|
||||
if (info >= 0) {
|
||||
|
|
|
@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
|||
}
|
||||
#endif
|
||||
|
||||
msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
|
||||
if ( *rows > *cols )
|
||||
msize = (size_t)(*rows) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
else
|
||||
msize = (size_t)(*cols) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
|
|
@ -102,7 +102,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
|
|||
if (ada >= h *safmin) {
|
||||
*C = sqrt(ada/h);
|
||||
*R = *DA / *C;
|
||||
*(R+1) = *(DA+1) / *(C+1);
|
||||
*(R+1) = *(DA+1) / *C;
|
||||
rtmax *= 2.;
|
||||
if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
|
||||
*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
|
||||
|
@ -115,7 +115,7 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
|
|||
*C = ada / adahsq;
|
||||
if (*C >= safmin) {
|
||||
*R = *DA / *C;
|
||||
*(R+1) = *(DA+1) / *(C+1);
|
||||
*(R+1) = *(DA+1) / *C;
|
||||
} else {
|
||||
*R = *DA * (h / adahsq);
|
||||
*(R+1) = *(DA+1) * (h / adahsq);
|
||||
|
|
|
@ -1349,6 +1349,9 @@ endif ()
|
|||
set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
|
||||
get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES)
|
||||
set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}")
|
||||
if (USE_GEMM3M)
|
||||
target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
|
||||
endif()
|
||||
endfunction ()
|
||||
|
||||
|
||||
|
|
|
@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
if ( n == 1 ) return( ABS(x[0]) );
|
||||
|
||||
n *= inc_x;
|
||||
while(i < n)
|
||||
while(abs(i) < abs(n))
|
||||
{
|
||||
|
||||
if ( x[i] != 0.0 )
|
||||
|
|
|
@ -62,7 +62,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
inc_x2 = 2 * inc_x;
|
||||
|
||||
n *= inc_x2;
|
||||
while(i < n)
|
||||
while(abs(i) < abs(n))
|
||||
{
|
||||
|
||||
if ( x[i] != 0.0 )
|
||||
|
|
|
@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
|
|||
else
|
||||
{
|
||||
temp = - da_i * x[ip+1] ;
|
||||
if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
|
||||
x[ip+1] = da_i * x[ip] ;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
CSUMKERNEL=csum.S
|
||||
|
||||
ifndef SNRM2KERNEL
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
endif
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
|
||||
CSUMKERNEL = csum_thunderx2t99.c
|
||||
ZSUMKERNEL = zsum_thunderx2t99.c
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
# This file defines nothing of its own: it only includes KERNEL.CORTEXA57 from $(KERNELDIR).
include $(KERNELDIR)/KERNEL.CORTEXA57
|
||||
|
||||
|
|
@ -91,8 +91,8 @@ IDAMAXKERNEL = iamax_thunderx2t99.c
|
|||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
|
|
|
@ -0,0 +1,247 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define N "x0" /* vector length */
|
||||
#define X "x1" /* "X" vector address */
|
||||
#define INC_X "x2" /* "X" stride */
|
||||
#define J "x5" /* loop variable */
|
||||
|
||||
#define REG0 "wzr"
|
||||
#define SUMF "s0"
|
||||
#define SUMFD "d0"
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
#define KERNEL_F1 \
|
||||
"ldr d1, ["X"] \n" \
|
||||
"add "X", "X", #8 \n" \
|
||||
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
|
||||
"fadd s1, s1, s2 \n" \
|
||||
"fadd "SUMF", "SUMF", s1 \n"
|
||||
|
||||
#define KERNEL_F32 \
|
||||
"ldr q16, ["X"] \n" \
|
||||
"ldr q17, ["X", #16] \n" \
|
||||
"ldr q18, ["X", #32] \n" \
|
||||
"ldr q19, ["X", #48] \n" \
|
||||
"ldp q20, q21, ["X", #64] \n" \
|
||||
"ldp q22, q23, ["X", #96] \n" \
|
||||
"ldp q24, q25, ["X", #128] \n" \
|
||||
"ldp q26, q27, ["X", #160] \n" \
|
||||
"fadd v16.4s, v16.4s, v17.4s \n" \
|
||||
"fadd v18.4s, v18.4s, v19.4s \n" \
|
||||
"ldp q28, q29, ["X", #192] \n" \
|
||||
"ldp q30, q31, ["X", #224] \n" \
|
||||
"add "X", "X", #256 \n" \
|
||||
"fadd v20.4s, v20.4s, v21.4s \n" \
|
||||
"fadd v22.4s, v22.4s, v23.4s \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024] \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
|
||||
"fadd v24.4s, v24.4s, v25.4s \n" \
|
||||
"fadd v26.4s, v26.4s, v27.4s \n" \
|
||||
"fadd v0.4s, v0.4s, v16.4s \n" \
|
||||
"fadd v1.4s, v1.4s, v18.4s \n" \
|
||||
"fadd v2.4s, v2.4s, v20.4s \n" \
|
||||
"fadd v3.4s, v3.4s, v22.4s \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
|
||||
"fadd v28.4s, v28.4s, v29.4s \n" \
|
||||
"fadd v30.4s, v30.4s, v31.4s \n" \
|
||||
"fadd v4.4s, v4.4s, v24.4s \n" \
|
||||
"fadd v5.4s, v5.4s, v26.4s \n" \
|
||||
"fadd v6.4s, v6.4s, v28.4s \n" \
|
||||
"fadd v7.4s, v7.4s, v30.4s \n"
|
||||
|
||||
#define KERNEL_F32_FINALIZE \
|
||||
"fadd v0.4s, v0.4s, v1.4s \n" \
|
||||
"fadd v2.4s, v2.4s, v3.4s \n" \
|
||||
"fadd v4.4s, v4.4s, v5.4s \n" \
|
||||
"fadd v6.4s, v6.4s, v7.4s \n" \
|
||||
"fadd v0.4s, v0.4s, v2.4s \n" \
|
||||
"fadd v4.4s, v4.4s, v6.4s \n" \
|
||||
"fadd v0.4s, v0.4s, v4.4s \n" \
|
||||
"ext v1.16b, v0.16b, v0.16b, #8 \n" \
|
||||
"fadd v0.2s, v0.2s, v1.2s \n" \
|
||||
"faddp "SUMF", v0.2s \n"
|
||||
|
||||
#define INIT_S \
|
||||
"lsl "INC_X", "INC_X", #3 \n"
|
||||
|
||||
#define KERNEL_S1 \
|
||||
"ldr d1, ["X"] \n" \
|
||||
"add "X", "X", "INC_X" \n" \
|
||||
"ext v2.8b, v1.8b, v1.8b, #4 \n" \
|
||||
"fadd s1, s1, s2 \n" \
|
||||
"fadd "SUMF", "SUMF", s1 \n"
|
||||
|
||||
|
||||
#if defined(SMP)
|
||||
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Adds up the components of n complex elements of x with inline AArch64
 * assembly and returns the total.
 *
 *   n      number of complex elements.  n < 0 returns 0.0 from C; the asm
 *          branches straight to the exit label when n <= 0.
 *   x      address of the first element.
 *   inc_x  element stride.  The asm branches to the exit label when
 *          inc_x <= 0; inc_x == 1 selects the contiguous path, anything else
 *          the strided path.
 *
 * Contiguous path (labels 1-4): n >> 5 iterations of KERNEL_F32 followed by
 * KERNEL_F32_FINALIZE, then n & 31 iterations of KERNEL_F1 (defined earlier in
 * the file, outside this view).
 * Strided path (labels 5-8): INIT_S scales the stride, then KERNEL_S1 runs
 * unrolled four times per iteration for n >> 2 iterations, and once per
 * iteration for the remaining n & 3 elements.
 *
 * The result is read from SUMFD at label 9 (SUMFD is defined outside this
 * view; presumably it is the 64-bit name of the SUMF register - confirm).
 */
static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0 ;

	if ( n < 0 ) return(asum);

	__asm__ __volatile__ (
	" mov "N", %[N_] \n"
	" mov "X", %[X_] \n"
	" mov "INC_X", %[INCX_] \n"
	/* Clear the scalar sum and the accumulators used by KERNEL_F32. */
	" fmov "SUMF", "REG0" \n"
	" fmov s1, "REG0" \n"
	" fmov s2, "REG0" \n"
	" fmov s3, "REG0" \n"
	" fmov s4, "REG0" \n"
	" fmov s5, "REG0" \n"
	" fmov s6, "REG0" \n"
	" fmov s7, "REG0" \n"
	" cmp "N", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", #1 \n"
	" bne 5f //asum_kernel_S_BEGIN \n"

	"1: //asum_kernel_F_BEGIN: \n"
	" asr "J", "N", #5 \n"
	" cmp "J", xzr \n"
	" beq 3f //asum_kernel_F1 \n"

	"2: //asum_kernel_F32: \n"
	" "KERNEL_F32" \n"
	" subs "J", "J", #1 \n"
	" bne 2b //asum_kernel_F32 \n"
	" "KERNEL_F32_FINALIZE" \n"

	"3: //asum_kernel_F1: \n"
	" ands "J", "N", #31 \n"
	" ble 9f //asum_kernel_L999 \n"

	"4: //asum_kernel_F10: \n"
	" "KERNEL_F1" \n"
	" subs "J", "J", #1 \n"
	" bne 4b //asum_kernel_F10 \n"
	" b 9f //asum_kernel_L999 \n"

	"5: //asum_kernel_S_BEGIN: \n"
	" "INIT_S" \n"
	" asr "J", "N", #2 \n"
	" cmp "J", xzr \n"
	" ble 7f //asum_kernel_S1 \n"

	"6: //asum_kernel_S4: \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 6b //asum_kernel_S4 \n"

	"7: //asum_kernel_S1: \n"
	" ands "J", "N", #3 \n"
	" ble 9f //asum_kernel_L999 \n"

	"8: //asum_kernel_S10: \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 8b //asum_kernel_S10 \n"

	"9: //asum_kernel_L999: \n"
	" fmov %[ASUM_], "SUMFD" \n"

	: [ASUM_] "=r" (asum) //%0
	: [N_] "r" (n), //%1
	  [X_] "r" (x), //%2
	  [INCX_] "r" (inc_x) //%3
	: "cc",
	  "memory",
	  "x0", "x1", "x2", "x3", "x4", "x5",
	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	  /* KERNEL_F32 loads into and overwrites v16-v31; they have to be
	   * declared so the compiler does not keep live values in them
	   * across this statement. */
	  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
	  "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
	);

	return asum;
}
|
||||
|
||||
#if defined(SMP)
|
||||
/*
 * Worker routine handed to blas_level1_thread_with_return_value.
 * Computes the sum for the slice (n, x, inc_x) and stores it through result.
 * The dummy arguments, y and inc_y are not used.  Always returns 0.
 */
static int casum_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	FLOAT partial = casum_compute(n, x, inc_x);

	result[0] = partial;
	return 0;
}
|
||||
#endif
|
||||
|
||||
/*
 * Public entry point (the symbol name comes from the CNAME macro).
 *
 * Without SMP this is a single call to casum_compute.  With SMP, inputs with
 * inc_x == 0 or n <= 10000 also take the single call; larger inputs are
 * dispatched over num_cpu_avail(1) threads, unless that count is 1.
 * After dispatch, one FLOAT per thread is read from the result buffer at a
 * spacing of 2 * sizeof(double) bytes (the slot layout this loop expects from
 * blas_level1_thread_with_return_value) and the values are added in thread
 * order.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
	FLOAT dummy_alpha;
	int nthreads = (inc_x == 0 || n <= 10000) ? 1 : num_cpu_avail(1);

	if (nthreads != 1) {
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		FLOAT total = 0.0;
		int t;

		blas_level1_thread_with_return_value(BLAS_SINGLE | BLAS_COMPLEX,
			n, 0, 0, &dummy_alpha,
			x, inc_x, NULL, 0, result, 0,
			( void *)casum_thread_function, nthreads);

		/* Slot t starts t * 16 bytes into the buffer. */
		for (t = 0; t < nthreads; t++)
			total = total + *(FLOAT *)(result + t * (sizeof(double) * 2));

		return total;
	}
#endif

	return casum_compute(n, x, inc_x);
}
|
|
@ -77,7 +77,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" cmp "N", xzr \n"
|
||||
" ble 9f //nrm2_kernel_L999 \n"
|
||||
" cmp "INC_X", xzr \n"
|
||||
" ble 9f //nrm2_kernel_L999 \n"
|
||||
" beq 9f //nrm2_kernel_L999 \n"
|
||||
|
||||
"1: //nrm2_kernel_F_BEGIN: \n"
|
||||
" mov x6, #0x7FF0000000000000 //+Infinity \n"
|
||||
|
@ -345,7 +345,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
#endif
|
||||
FLOAT ssq, scale;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||
if (n <= 0 || inc_x == 0) return 0.0;
|
||||
|
||||
#if defined(SMP)
|
||||
if (n <= 10000)
|
||||
|
|
|
@ -229,7 +229,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
" cmp "N", xzr \n"
|
||||
" ble 9f //nrm2_kernel_L999 \n"
|
||||
" cmp "INC_X", xzr \n"
|
||||
" ble 9f //nrm2_kernel_L999 \n"
|
||||
" beq 9f //nrm2_kernel_L999 \n"
|
||||
" cmp "INC_X", #1 \n"
|
||||
" bne 5f //nrm2_kernel_S_BEGIN \n"
|
||||
|
||||
|
@ -315,7 +315,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT nrm2 = 0.0;
|
||||
double nrm2_double = 0.0;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return 0.0;
|
||||
if (n <= 0 || inc_x == 0) return 0.0;
|
||||
|
||||
#if defined(SMP)
|
||||
if (n <= 10000)
|
||||
|
|
|
@ -223,7 +223,7 @@ zscal_begin:
|
|||
fcmp DA_I, #0.0
|
||||
beq .Lzscal_kernel_RI_zero
|
||||
|
||||
b .Lzscal_kernel_R_zero
|
||||
// b .Lzscal_kernel_R_zero
|
||||
|
||||
.Lzscal_kernel_R_non_zero:
|
||||
|
||||
|
|
|
@ -0,0 +1,244 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
/* Register names spliced into the inline-assembly strings below. */
#define N "x0" /* vector length */
|
||||
#define X "x1" /* "X" vector address */
|
||||
#define INC_X "x2" /* "X" stride */
|
||||
#define J "x5" /* loop variable */
|
||||
|
||||
/* Zero register; used as the fmov source when the accumulators are cleared. */
#define REG0 "xzr"
|
||||
/* Scalar running sum; its bits are moved to the C result at the exit label. */
#define SUMF "d0"
|
||||
/* Not referenced by the macros in this file, which name d1 directly. */
#define TMPF "d1"
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * KERNEL_F1: one step of the contiguous remainder loop.  Loads 16 bytes at X
 * into q1, advances X by 16, adds the two double-precision lanes of v1 with
 * faddp and accumulates the result into SUMF.  Overwrites v1.
 */
#define KERNEL_F1 \
|
||||
"ldr q1, ["X"] \n" \
|
||||
"add "X", "X", #16 \n" \
|
||||
"faddp d1, v1.2d \n" \
|
||||
"fadd "SUMF", "SUMF", d1 \n"
|
||||
|
||||
/*
 * KERNEL_F16: one unrolled step of the contiguous path.
 * Loads sixteen 128-bit registers (q16-q31, 256 bytes) starting at X, advances
 * X by 256, adds the registers pairwise as two-lane double-precision vectors
 * (v16+v17, v18+v19, ... v30+v31) and accumulates the eight results into
 * v0-v7.  The four PRFM hints are issued after X has been advanced, so they
 * target X+1024 .. X+1216 of the new pointer.  No absolute value is applied.
 * Writes v0-v7 and v16-v31.
 */
#define KERNEL_F16 \
|
||||
"ldr q16, ["X"] \n" \
|
||||
"ldr q17, ["X", #16] \n" \
|
||||
"ldr q18, ["X", #32] \n" \
|
||||
"ldr q19, ["X", #48] \n" \
|
||||
"ldp q20, q21, ["X", #64] \n" \
|
||||
"ldp q22, q23, ["X", #96] \n" \
|
||||
"ldp q24, q25, ["X", #128] \n" \
|
||||
"ldp q26, q27, ["X", #160] \n" \
|
||||
"fadd v16.2d, v16.2d, v17.2d \n" \
|
||||
"fadd v18.2d, v18.2d, v19.2d \n" \
|
||||
"ldp q28, q29, ["X", #192] \n" \
|
||||
"ldp q30, q31, ["X", #224] \n" \
|
||||
"add "X", "X", #256 \n" \
|
||||
"fadd v20.2d, v20.2d, v21.2d \n" \
|
||||
"fadd v22.2d, v22.2d, v23.2d \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024] \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024+64] \n" \
|
||||
"fadd v24.2d, v24.2d, v25.2d \n" \
|
||||
"fadd v26.2d, v26.2d, v27.2d \n" \
|
||||
"fadd v28.2d, v28.2d, v29.2d \n" \
|
||||
"fadd v30.2d, v30.2d, v31.2d \n" \
|
||||
"fadd v0.2d, v0.2d, v16.2d \n" \
|
||||
"fadd v1.2d, v1.2d, v18.2d \n" \
|
||||
"fadd v2.2d, v2.2d, v20.2d \n" \
|
||||
"fadd v3.2d, v3.2d, v22.2d \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024+128] \n" \
|
||||
"PRFM PLDL1KEEP, ["X", #1024+192] \n" \
|
||||
"fadd v4.2d, v4.2d, v24.2d \n" \
|
||||
"fadd v5.2d, v5.2d, v26.2d \n" \
|
||||
"fadd v6.2d, v6.2d, v28.2d \n" \
|
||||
"fadd v7.2d, v7.2d, v30.2d \n"
|
||||
|
||||
/*
 * KERNEL_F16_FINALIZE: collapses the eight vector accumulators v0-v7 into the
 * scalar SUMF.  A three-level tree of fadd leaves the total in v0.2d, and
 * faddp then adds its two lanes and writes the result to SUMF.
 */
#define KERNEL_F16_FINALIZE \
|
||||
"fadd v0.2d, v0.2d, v1.2d \n" \
|
||||
"fadd v2.2d, v2.2d, v3.2d \n" \
|
||||
"fadd v4.2d, v4.2d, v5.2d \n" \
|
||||
"fadd v6.2d, v6.2d, v7.2d \n" \
|
||||
"fadd v0.2d, v0.2d, v2.2d \n" \
|
||||
"fadd v4.2d, v4.2d, v6.2d \n" \
|
||||
"fadd v0.2d, v0.2d, v4.2d \n" \
|
||||
"faddp "SUMF", v0.2d \n"
|
||||
|
||||
/*
 * INIT_S: set-up for the strided path.  Shifts INC_X left by 4, i.e. multiplies
 * the stride by 16, which matches the 16 bytes KERNEL_S1 loads per step.
 */
#define INIT_S \
|
||||
"lsl "INC_X", "INC_X", #4 \n"
|
||||
|
||||
/*
 * KERNEL_S1: one step of the strided path.  Loads 16 bytes at X into q1,
 * advances X by INC_X (already scaled by INIT_S), adds the two
 * double-precision lanes of v1 with faddp and accumulates the result into
 * SUMF.  Overwrites v1.
 */
#define KERNEL_S1 \
|
||||
"ldr q1, ["X"] \n" \
|
||||
"add "X", "X", "INC_X" \n" \
|
||||
"faddp d1, v1.2d \n" \
|
||||
"fadd "SUMF", "SUMF", d1 \n"
|
||||
|
||||
|
||||
#if defined(SMP)
|
||||
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
|
||||
BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
|
||||
void *c, BLASLONG ldc, int (*function)(), int nthreads);
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Adds up the components of n complex elements of x with inline AArch64
 * assembly and returns the total.  The kernels add the loaded values as they
 * are; no absolute value is taken.
 *
 *   n      number of complex elements.  n < 0 returns 0.0 from C; for n <= 0
 *          the asm branches to the exit label with SUMF still zero.
 *   x      address of the first element.
 *   inc_x  element stride.  inc_x <= 0 also exits with a zero sum;
 *          inc_x == 1 selects the contiguous path, anything else the strided
 *          path.
 *
 * Contiguous path (labels 1-4): n >> 4 iterations of KERNEL_F16 followed by
 * KERNEL_F16_FINALIZE, then n & 15 iterations of KERNEL_F1.
 * Strided path (labels 5-8): INIT_S scales the stride to bytes, then
 * KERNEL_S1 runs unrolled four times per iteration for n >> 2 iterations, and
 * once per iteration for the remaining n & 3 elements.
 */
static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
	FLOAT asum = 0.0 ;

	if ( n < 0 ) return(asum);

	__asm__ __volatile__ (
	" mov "N", %[N_] \n"
	" mov "X", %[X_] \n"
	" mov "INC_X", %[INCX_] \n"
	/* Clear the scalar sum and the accumulators used by KERNEL_F16. */
	" fmov "SUMF", "REG0" \n"
	" fmov d1, "REG0" \n"
	" fmov d2, "REG0" \n"
	" fmov d3, "REG0" \n"
	" fmov d4, "REG0" \n"
	" fmov d5, "REG0" \n"
	" fmov d6, "REG0" \n"
	" fmov d7, "REG0" \n"
	" cmp "N", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", xzr \n"
	" ble 9f //asum_kernel_L999 \n"
	" cmp "INC_X", #1 \n"
	" bne 5f //asum_kernel_S_BEGIN \n"

	"1: //asum_kernel_F_BEGIN: \n"
	" asr "J", "N", #4 \n"
	" cmp "J", xzr \n"
	" beq 3f //asum_kernel_F1 \n"

	".align 5 \n"
	"2: //asum_kernel_F16: \n"
	" "KERNEL_F16" \n"
	" subs "J", "J", #1 \n"
	" bne 2b //asum_kernel_F16 \n"
	" "KERNEL_F16_FINALIZE" \n"

	"3: //asum_kernel_F1: \n"
	" ands "J", "N", #15 \n"
	" ble 9f //asum_kernel_L999 \n"

	"4: //asum_kernel_F10: \n"
	" "KERNEL_F1" \n"
	" subs "J", "J", #1 \n"
	" bne 4b //asum_kernel_F10 \n"
	" b 9f //asum_kernel_L999 \n"

	"5: //asum_kernel_S_BEGIN: \n"
	" "INIT_S" \n"
	" asr "J", "N", #2 \n"
	" cmp "J", xzr \n"
	" ble 7f //asum_kernel_S1 \n"

	"6: //asum_kernel_S4: \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 6b //asum_kernel_S4 \n"

	"7: //asum_kernel_S1: \n"
	" ands "J", "N", #3 \n"
	" ble 9f //asum_kernel_L999 \n"

	"8: //asum_kernel_S10: \n"
	" "KERNEL_S1" \n"
	" subs "J", "J", #1 \n"
	" bne 8b //asum_kernel_S10 \n"

	"9: //asum_kernel_L999: \n"
	" fmov %[ASUM_], "SUMF" \n"

	: [ASUM_] "=r" (asum) //%0
	: [N_] "r" (n), //%1
	  [X_] "r" (x), //%2
	  [INCX_] "r" (inc_x) //%3
	: "cc",
	  "memory",
	  "x0", "x1", "x2", "x3", "x4", "x5",
	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	  /* KERNEL_F16 loads into and overwrites v16-v31; they have to be
	   * declared so the compiler does not keep live values in them
	   * across this statement. */
	  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
	  "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
	);

	return asum;
}
|
||||
|
||||
#if defined(SMP)
|
||||
/*
 * Worker routine handed to blas_level1_thread_with_return_value.
 * Computes the sum for the slice (n, x, inc_x) and stores it through result.
 * The dummy arguments, y and inc_y are not used.  Always returns 0.
 */
static int zasum_thread_function(BLASLONG n, BLASLONG dummy0,
	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
{
	FLOAT partial = zasum_compute(n, x, inc_x);

	result[0] = partial;
	return 0;
}
|
||||
#endif
|
||||
|
||||
/*
 * Public entry point (the symbol name comes from the CNAME macro).
 *
 * Without SMP this is a single call to zasum_compute.  With SMP, inputs with
 * inc_x == 0 or n <= 10000 also take the single call; larger inputs are
 * dispatched over num_cpu_avail(1) threads, unless that count is 1.
 * After dispatch, one FLOAT per thread is read from the result buffer at a
 * spacing of 2 * sizeof(double) bytes (the slot layout this loop expects from
 * blas_level1_thread_with_return_value) and the values are added in thread
 * order.
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
#if defined(SMP)
	FLOAT dummy_alpha;
	int nthreads = (inc_x == 0 || n <= 10000) ? 1 : num_cpu_avail(1);

	if (nthreads != 1) {
		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
		FLOAT total = 0.0;
		int t;

		blas_level1_thread_with_return_value(BLAS_DOUBLE | BLAS_COMPLEX,
			n, 0, 0, &dummy_alpha,
			x, inc_x, NULL, 0, result, 0,
			( void *)zasum_thread_function, nthreads);

		/* Slot t starts t * 16 bytes into the buffer. */
		for (t = 0; t < nthreads; t++)
			total = total + *(FLOAT *)(result + t * (sizeof(double) * 2));

		return total;
	}
#endif

	return zasum_compute(n, x, inc_x);
}
|
|
@ -0,0 +1,149 @@
|
|||
# Kernel source selection table.  Each variable names the source file used for
# one routine; the leading letter(s) of the variable name select the variant
# (S/D/C/Z, with an I prefix for the index-returning routines).
# All assignments are recursive (=), so $(TSUFFIX) and $(SUFFIX) in the
# *COPYOBJ names are expanded where those variables are used.

# --- absolute max / min ---
SAMAXKERNEL    = ../arm/amax.c
DAMAXKERNEL    = ../arm/amax.c
CAMAXKERNEL    = ../arm/zamax.c
ZAMAXKERNEL    = ../arm/zamax.c

SAMINKERNEL    = ../arm/amin.c
DAMINKERNEL    = ../arm/amin.c
CAMINKERNEL    = ../arm/zamin.c
ZAMINKERNEL    = ../arm/zamin.c

# --- max / min ---
SMAXKERNEL     = ../arm/max.c
DMAXKERNEL     = ../arm/max.c

SMINKERNEL     = ../arm/min.c
DMINKERNEL     = ../arm/min.c

# --- index of absolute max / min ---
ISAMAXKERNEL   = ../arm/iamax.c
IDAMAXKERNEL   = ../arm/iamax.c
ICAMAXKERNEL   = ../arm/izamax.c
IZAMAXKERNEL   = ../arm/izamax.c

ISAMINKERNEL   = ../arm/iamin.c
IDAMINKERNEL   = ../arm/iamin.c
ICAMINKERNEL   = ../arm/izamin.c
IZAMINKERNEL   = ../arm/izamin.c

# --- index of max / min ---
ISMAXKERNEL    = ../arm/imax.c
IDMAXKERNEL    = ../arm/imax.c

ISMINKERNEL    = ../arm/imin.c
IDMINKERNEL    = ../arm/imin.c

# --- asum / sum ---
SASUMKERNEL    = ../arm/asum.c
DASUMKERNEL    = ../arm/asum.c
CASUMKERNEL    = ../arm/zasum.c
ZASUMKERNEL    = ../arm/zasum.c

SSUMKERNEL     = ../arm/sum.c
DSUMKERNEL     = ../arm/sum.c
CSUMKERNEL     = ../arm/zsum.c
ZSUMKERNEL     = ../arm/zsum.c

# --- axpy / copy / dot ---
SAXPYKERNEL    = ../arm/axpy.c
DAXPYKERNEL    = ../arm/axpy.c
CAXPYKERNEL    = ../arm/zaxpy.c
ZAXPYKERNEL    = ../arm/zaxpy.c

SCOPYKERNEL    = ../arm/copy.c
DCOPYKERNEL    = ../arm/copy.c
CCOPYKERNEL    = ../arm/zcopy.c
ZCOPYKERNEL    = ../arm/zcopy.c

SDOTKERNEL     = ../arm/dot.c
DDOTKERNEL     = ../arm/dot.c
CDOTKERNEL     = ../arm/zdot.c
ZDOTKERNEL     = ../arm/zdot.c
DSDOTKERNEL    = ../generic/dot.c

# --- nrm2 / rot / scal / swap ---
SNRM2KERNEL    = ../arm/nrm2.c
DNRM2KERNEL    = ../arm/nrm2.c
CNRM2KERNEL    = ../arm/znrm2.c
ZNRM2KERNEL    = ../arm/znrm2.c

SROTKERNEL     = ../arm/rot.c
DROTKERNEL     = ../arm/rot.c
CROTKERNEL     = ../arm/zrot.c
ZROTKERNEL     = ../arm/zrot.c

SSCALKERNEL    = ../arm/scal.c
DSCALKERNEL    = ../arm/scal.c
CSCALKERNEL    = ../arm/zscal.c
ZSCALKERNEL    = ../arm/zscal.c

SSWAPKERNEL    = ../arm/swap.c
DSWAPKERNEL    = ../arm/swap.c
CSWAPKERNEL    = ../arm/zswap.c
ZSWAPKERNEL    = ../arm/zswap.c

# --- gemv ---
SGEMVNKERNEL   = ../arm/gemv_n.c
DGEMVNKERNEL   = ../arm/gemv_n.c
CGEMVNKERNEL   = ../arm/zgemv_n.c
ZGEMVNKERNEL   = ../arm/zgemv_n.c

SGEMVTKERNEL   = ../arm/gemv_t.c
DGEMVTKERNEL   = ../arm/gemv_t.c
CGEMVTKERNEL   = ../arm/zgemv_t.c
ZGEMVTKERNEL   = ../arm/zgemv_t.c

# --- trmm ---
STRMMKERNEL    = ../generic/trmmkernel_2x2.c
DTRMMKERNEL    = ../generic/trmmkernel_2x2.c
CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c

# --- gemm kernels, copy routines and copy-routine object names ---
SGEMMKERNEL    = ../generic/gemmkernel_2x2.c
SGEMMONCOPY    = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL    = ../generic/gemmkernel_2x2.c
DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

# --- trsm (every variant uses the same four generic sources) ---
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

# --- helpers ---
SCABS_KERNEL   = ../generic/cabs.c
DCABS_KERNEL   = ../generic/cabs.c
QCABS_KERNEL   = ../generic/cabs.c
LSAME_KERNEL   = ../generic/lsame.c

# --- gemm beta scaling ---
SGEMM_BETA     = ../generic/gemm_beta.c
DGEMM_BETA     = ../generic/gemm_beta.c
CGEMM_BETA     = ../generic/zgemm_beta.c
ZGEMM_BETA     = ../generic/zgemm_beta.c
|
||||
|
||||
|
|
@ -0,0 +1 @@
|
|||
# Double-colon `clean` rule declared with no prerequisites and no recipe lines.
clean ::
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,
|
|||
|
||||
if ( rows <= 0 ) return(0);
|
||||
if ( cols <= 0 ) return(0);
|
||||
if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0);
|
||||
|
||||
aptr = a;
|
||||
lda *= 2;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,587 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2;
|
||||
|
||||
FLOAT *boffset;
|
||||
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
|
||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
lda *= 2;
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "M = %d N = %d\n", m, n);
|
||||
#endif
|
||||
|
||||
j = (n >> 4);
|
||||
if (j > 0){
|
||||
do{
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 32;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
ctemp17 = *(aoffset1 + 16);
|
||||
ctemp18 = *(aoffset1 + 17);
|
||||
ctemp19 = *(aoffset1 + 18);
|
||||
ctemp20 = *(aoffset1 + 19);
|
||||
ctemp21 = *(aoffset1 + 20);
|
||||
ctemp22 = *(aoffset1 + 21);
|
||||
ctemp23 = *(aoffset1 + 22);
|
||||
ctemp24 = *(aoffset1 + 23);
|
||||
ctemp25 = *(aoffset1 + 24);
|
||||
ctemp26 = *(aoffset1 + 25);
|
||||
ctemp27 = *(aoffset1 + 26);
|
||||
ctemp28 = *(aoffset1 + 27);
|
||||
ctemp29 = *(aoffset1 + 28);
|
||||
ctemp30 = *(aoffset1 + 29);
|
||||
ctemp31 = *(aoffset1 + 30);
|
||||
ctemp32 = *(aoffset1 + 31);
|
||||
|
||||
ctemp33 = *(aoffset2 + 0);
|
||||
ctemp34 = *(aoffset2 + 1);
|
||||
ctemp35 = *(aoffset2 + 2);
|
||||
ctemp36 = *(aoffset2 + 3);
|
||||
ctemp37 = *(aoffset2 + 4);
|
||||
ctemp38 = *(aoffset2 + 5);
|
||||
ctemp39 = *(aoffset2 + 6);
|
||||
ctemp40 = *(aoffset2 + 7);
|
||||
ctemp41 = *(aoffset2 + 8);
|
||||
ctemp42 = *(aoffset2 + 9);
|
||||
ctemp43 = *(aoffset2 + 10);
|
||||
ctemp44 = *(aoffset2 + 11);
|
||||
ctemp45 = *(aoffset2 + 12);
|
||||
ctemp46 = *(aoffset2 + 13);
|
||||
ctemp47 = *(aoffset2 + 14);
|
||||
ctemp48 = *(aoffset2 + 15);
|
||||
ctemp49 = *(aoffset2 + 16);
|
||||
ctemp50 = *(aoffset2 + 17);
|
||||
ctemp51 = *(aoffset2 + 18);
|
||||
ctemp52 = *(aoffset2 + 19);
|
||||
ctemp53 = *(aoffset2 + 20);
|
||||
ctemp54 = *(aoffset2 + 21);
|
||||
ctemp55 = *(aoffset2 + 22);
|
||||
ctemp56 = *(aoffset2 + 23);
|
||||
ctemp57 = *(aoffset2 + 24);
|
||||
ctemp58 = *(aoffset2 + 25);
|
||||
ctemp59 = *(aoffset2 + 26);
|
||||
ctemp60 = *(aoffset2 + 27);
|
||||
ctemp61 = *(aoffset2 + 28);
|
||||
ctemp62 = *(aoffset2 + 29);
|
||||
ctemp63 = *(aoffset2 + 30);
|
||||
ctemp64 = *(aoffset2 + 31);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
*(boffset + 16) = -ctemp17;
|
||||
*(boffset + 17) = -ctemp18;
|
||||
*(boffset + 18) = -ctemp19;
|
||||
*(boffset + 19) = -ctemp20;
|
||||
*(boffset + 20) = -ctemp21;
|
||||
*(boffset + 21) = -ctemp22;
|
||||
*(boffset + 22) = -ctemp23;
|
||||
*(boffset + 23) = -ctemp24;
|
||||
|
||||
*(boffset + 24) = -ctemp25;
|
||||
*(boffset + 25) = -ctemp26;
|
||||
*(boffset + 26) = -ctemp27;
|
||||
*(boffset + 27) = -ctemp28;
|
||||
*(boffset + 28) = -ctemp29;
|
||||
*(boffset + 29) = -ctemp30;
|
||||
*(boffset + 30) = -ctemp31;
|
||||
*(boffset + 31) = -ctemp32;
|
||||
|
||||
*(boffset + 32) = -ctemp33;
|
||||
*(boffset + 33) = -ctemp34;
|
||||
*(boffset + 34) = -ctemp35;
|
||||
*(boffset + 35) = -ctemp36;
|
||||
*(boffset + 36) = -ctemp37;
|
||||
*(boffset + 37) = -ctemp38;
|
||||
*(boffset + 38) = -ctemp39;
|
||||
*(boffset + 39) = -ctemp40;
|
||||
|
||||
*(boffset + 40) = -ctemp41;
|
||||
*(boffset + 41) = -ctemp42;
|
||||
*(boffset + 42) = -ctemp43;
|
||||
*(boffset + 43) = -ctemp44;
|
||||
*(boffset + 44) = -ctemp45;
|
||||
*(boffset + 45) = -ctemp46;
|
||||
*(boffset + 46) = -ctemp47;
|
||||
*(boffset + 47) = -ctemp48;
|
||||
|
||||
*(boffset + 48) = -ctemp49;
|
||||
*(boffset + 49) = -ctemp50;
|
||||
*(boffset + 50) = -ctemp51;
|
||||
*(boffset + 51) = -ctemp52;
|
||||
*(boffset + 52) = -ctemp53;
|
||||
*(boffset + 53) = -ctemp54;
|
||||
*(boffset + 54) = -ctemp55;
|
||||
*(boffset + 55) = -ctemp56;
|
||||
|
||||
*(boffset + 56) = -ctemp57;
|
||||
*(boffset + 57) = -ctemp58;
|
||||
*(boffset + 58) = -ctemp59;
|
||||
*(boffset + 59) = -ctemp60;
|
||||
*(boffset + 60) = -ctemp61;
|
||||
*(boffset + 61) = -ctemp62;
|
||||
*(boffset + 62) = -ctemp63;
|
||||
*(boffset + 63) = -ctemp64;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 64;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
ctemp17 = *(aoffset1 + 16);
|
||||
ctemp18 = *(aoffset1 + 17);
|
||||
ctemp19 = *(aoffset1 + 18);
|
||||
ctemp20 = *(aoffset1 + 19);
|
||||
ctemp21 = *(aoffset1 + 20);
|
||||
ctemp22 = *(aoffset1 + 21);
|
||||
ctemp23 = *(aoffset1 + 22);
|
||||
ctemp24 = *(aoffset1 + 23);
|
||||
ctemp25 = *(aoffset1 + 24);
|
||||
ctemp26 = *(aoffset1 + 25);
|
||||
ctemp27 = *(aoffset1 + 26);
|
||||
ctemp28 = *(aoffset1 + 27);
|
||||
ctemp29 = *(aoffset1 + 28);
|
||||
ctemp30 = *(aoffset1 + 29);
|
||||
ctemp31 = *(aoffset1 + 30);
|
||||
ctemp32 = *(aoffset1 + 31);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
*(boffset + 16) = -ctemp17;
|
||||
*(boffset + 17) = -ctemp18;
|
||||
*(boffset + 18) = -ctemp19;
|
||||
*(boffset + 19) = -ctemp20;
|
||||
*(boffset + 20) = -ctemp21;
|
||||
*(boffset + 21) = -ctemp22;
|
||||
*(boffset + 22) = -ctemp23;
|
||||
*(boffset + 23) = -ctemp24;
|
||||
|
||||
*(boffset + 24) = -ctemp25;
|
||||
*(boffset + 25) = -ctemp26;
|
||||
*(boffset + 26) = -ctemp27;
|
||||
*(boffset + 27) = -ctemp28;
|
||||
*(boffset + 28) = -ctemp29;
|
||||
*(boffset + 29) = -ctemp30;
|
||||
*(boffset + 30) = -ctemp31;
|
||||
*(boffset + 31) = -ctemp32;
|
||||
|
||||
boffset += 32;
|
||||
}
|
||||
|
||||
j--;
|
||||
}while(j > 0);
|
||||
} /* end of if(j > 0) */
|
||||
|
||||
if (n & 8){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 16;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
|
||||
ctemp17 = *(aoffset2 + 0);
|
||||
ctemp18 = *(aoffset2 + 1);
|
||||
ctemp19 = *(aoffset2 + 2);
|
||||
ctemp20 = *(aoffset2 + 3);
|
||||
ctemp21 = *(aoffset2 + 4);
|
||||
ctemp22 = *(aoffset2 + 5);
|
||||
ctemp23 = *(aoffset2 + 6);
|
||||
ctemp24 = *(aoffset2 + 7);
|
||||
ctemp25 = *(aoffset2 + 8);
|
||||
ctemp26 = *(aoffset2 + 9);
|
||||
ctemp27 = *(aoffset2 + 10);
|
||||
ctemp28 = *(aoffset2 + 11);
|
||||
ctemp29 = *(aoffset2 + 12);
|
||||
ctemp30 = *(aoffset2 + 13);
|
||||
ctemp31 = *(aoffset2 + 14);
|
||||
ctemp32 = *(aoffset2 + 15);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
*(boffset + 16) = -ctemp17;
|
||||
*(boffset + 17) = -ctemp18;
|
||||
*(boffset + 18) = -ctemp19;
|
||||
*(boffset + 19) = -ctemp20;
|
||||
*(boffset + 20) = -ctemp21;
|
||||
*(boffset + 21) = -ctemp22;
|
||||
*(boffset + 22) = -ctemp23;
|
||||
*(boffset + 23) = -ctemp24;
|
||||
|
||||
*(boffset + 24) = -ctemp25;
|
||||
*(boffset + 25) = -ctemp26;
|
||||
*(boffset + 26) = -ctemp27;
|
||||
*(boffset + 27) = -ctemp28;
|
||||
*(boffset + 28) = -ctemp29;
|
||||
*(boffset + 29) = -ctemp30;
|
||||
*(boffset + 30) = -ctemp31;
|
||||
*(boffset + 31) = -ctemp32;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 32;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
boffset += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 4){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 8;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
|
||||
ctemp09 = *(aoffset2 + 0);
|
||||
ctemp10 = *(aoffset2 + 1);
|
||||
ctemp11 = *(aoffset2 + 2);
|
||||
ctemp12 = *(aoffset2 + 3);
|
||||
ctemp13 = *(aoffset2 + 4);
|
||||
ctemp14 = *(aoffset2 + 5);
|
||||
ctemp15 = *(aoffset2 + 6);
|
||||
ctemp16 = *(aoffset2 + 7);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 16;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
boffset += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 2){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 4;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
|
||||
ctemp05 = *(aoffset2 + 0);
|
||||
ctemp06 = *(aoffset2 + 1);
|
||||
ctemp07 = *(aoffset2 + 2);
|
||||
ctemp08 = *(aoffset2 + 3);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 8;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
|
||||
boffset += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 1){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
// aoffset += 2;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset2 + 0);
|
||||
ctemp04 = *(aoffset2 + 1);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 4;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
// boffset += 2;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,333 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Pack one panel of `width` adjacent columns (width <= 16) into b.
 *
 * Every element occupies two consecutive FLOATs (presumably the real and
 * imaginary parts of a complex value -- confirm against the callers).
 * `lda` must already be expressed in FLOAT units, i.e. doubled by the caller.
 *
 * Column k of the panel is read through its own cursor.  While
 * (posX - posY - rows_done) > -k the cursor starts at
 * a + (posX + k) * 2 + posY * lda and steps by lda; otherwise it starts at
 * a + posY * 2 + (posX + k) * lda and steps by 2.  Because the running
 * offset drops by one per row, a cursor that began with the lda stride
 * switches to the stride of 2 once that comparison fails.
 * NOTE(review): this looks like mirroring across the diagonal of a matrix of
 * which only one triangle is stored -- confirm.
 *
 * For each of the m rows, 2 * width FLOATs are written contiguously to b.
 * Returns the position in b just past the last value written.
 */
static FLOAT *pack_mirrored_panel(BLASLONG m, FLOAT *a, BLASLONG lda,
                                  BLASLONG posX, BLASLONG posY,
                                  FLOAT *b, BLASLONG width){

  FLOAT   *cursor[16];
  FLOAT    staged[32];
  BLASLONG diag = posX - posY;
  BLASLONG row, col;

  /* Choose the starting address of every column cursor. */
  for (col = 0; col < width; col++) {
    if (diag > -col) {
      cursor[col] = a + (posX + col) * 2 + posY * lda;
    } else {
      cursor[col] = a + posY * 2 + (posX + col) * lda;
    }
  }

  for (row = m; row > 0; row--) {

    /* Load the whole row first, before any store to b happens. */
    for (col = 0; col < width; col++) {
      staged[2 * col + 0] = cursor[col][0];
      staged[2 * col + 1] = cursor[col][1];
    }

    /* Advance each cursor with the stride selected by the current offset. */
    for (col = 0; col < width; col++) {
      cursor[col] += (diag > -col) ? lda : 2;
    }

    /* Emit the row in column order, two FLOATs per column. */
    for (col = 0; col < 2 * width; col++) {
      b[col] = staged[col];
    }

    b += 2 * width;
    diag--;
  }

  return b;
}

/*
 * Copy an m-row by n-column region of `a`, anchored at (posX, posY), into
 * the contiguous buffer `b`.
 *
 * The n columns are consumed in panels: as many 16-wide panels as fit
 * (n >> 4), followed by at most one panel each of width 8, 4, 2 and 1,
 * selected by the corresponding bit of n.  Panels are laid out one after
 * another in b; inside a panel each row contributes 2 * width FLOATs.
 *
 * lda is doubled on entry, so the caller passes it in units of two-FLOAT
 * elements.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  BLASLONG panels;

  lda *= 2;

  /* Full 16-column panels. */
  for (panels = (n >> 4); panels > 0; panels--) {
    b = pack_mirrored_panel(m, a, lda, posX, posY, b, 16);
    posX += 16;
  }

  /* Remaining columns, widest panel first. */
  if (n & 8) {
    b = pack_mirrored_panel(m, a, lda, posX, posY, b, 8);
    posX += 8;
  }

  if (n & 4) {
    b = pack_mirrored_panel(m, a, lda, posX, posY, b, 4);
    posX += 4;
  }

  if (n & 2) {
    b = pack_mirrored_panel(m, a, lda, posX, posY, b, 2);
    posX += 2;
  }

  if (n & 1) {
    /* Last panel: nothing follows, so posX and b need no further update. */
    pack_mirrored_panel(m, a, lda, posX, posY, b, 1);
  }

  return 0;
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue