Merge branch 'develop' into risc-v
This commit is contained in:
commit
45786b05da
24
.drone.yml
24
.drone.yml
|
|
@ -190,3 +190,27 @@ steps:
|
|||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C cpp_thread_test dgemm_tester
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc10
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:20.04
|
||||
environment:
|
||||
CC: gcc-10
|
||||
FC: gfortran-10
|
||||
COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran-10 perl python g++
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ jobs:
|
|||
- name: Update Homebrew
|
||||
if: github.event_name != 'pull_request'
|
||||
run: brew update || true
|
||||
|
||||
|
||||
- name: Install prerequisites
|
||||
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
|
||||
|
||||
|
|
|
|||
|
|
@ -89,5 +89,7 @@ build.*
|
|||
*.swp
|
||||
benchmark/*.goto
|
||||
benchmark/smallscaling
|
||||
.vscode
|
||||
CMakeCache.txt
|
||||
CMakeFiles/*
|
||||
.vscode
|
||||
|
|
|
|||
320
.travis.yml
320
.travis.yml
|
|
@ -1,33 +1,38 @@
|
|||
# XXX: Precise is already deprecated, new default is Trusty.
|
||||
# https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming
|
||||
dist: precise
|
||||
dist: focal
|
||||
sudo: true
|
||||
language: c
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- &test-ubuntu
|
||||
os: linux
|
||||
# os: linux
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- gfortran
|
||||
# before_script: &common-before
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
# script:
|
||||
# - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# - make -C test $COMMON_FLAGS $BTYPE
|
||||
# - make -C ctest $COMMON_FLAGS $BTYPE
|
||||
# - make -C utest $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
before_script: &common-before
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
script:
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX
|
||||
|
|
@ -55,38 +60,38 @@ matrix:
|
|||
- TARGET_BOX=IBMZ_LINUX
|
||||
- BTYPE="BINARY=64 USE_OPENMP=0 CC=clang"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
compiler: clang
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 CC=clang"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
compiler: clang
|
||||
env:
|
||||
- TARGET_BOX=LINUX64
|
||||
- BTYPE="BINARY=64 INTERFACE64=1 CC=clang"
|
||||
|
||||
- <<: *test-ubuntu
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- gcc-multilib
|
||||
- gfortran-multilib
|
||||
env:
|
||||
- TARGET_BOX=LINUX32
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
# - <<: *test-ubuntu
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 INTERFACE64=1"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# compiler: clang
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 CC=clang"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# compiler: clang
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64
|
||||
# - BTYPE="BINARY=64 INTERFACE64=1 CC=clang"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
# addons:
|
||||
# apt:
|
||||
# packages:
|
||||
# - gcc-multilib
|
||||
# - gfortran-multilib
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX32
|
||||
# - BTYPE="BINARY=32"
|
||||
#
|
||||
- os: linux
|
||||
arch: ppc64le
|
||||
dist: bionic
|
||||
|
|
@ -121,47 +126,47 @@ matrix:
|
|||
# for matrix annotation only
|
||||
- TARGET_BOX=PPC64LE_LINUX_P9
|
||||
|
||||
- os: linux
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- binutils-mingw-w64-x86-64
|
||||
- gcc-mingw-w64-x86-64
|
||||
- gfortran-mingw-w64-x86-64
|
||||
before_script: *common-before
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- TARGET_BOX=WIN64
|
||||
- BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
|
||||
|
||||
# - os: linux
|
||||
# compiler: gcc
|
||||
# addons:
|
||||
# apt:
|
||||
# packages:
|
||||
# - binutils-mingw-w64-x86-64
|
||||
# - gcc-mingw-w64-x86-64
|
||||
# - gfortran-mingw-w64-x86-64
|
||||
# before_script: *common-before
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - TARGET_BOX=WIN64
|
||||
# - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
|
||||
#
|
||||
# Build & test on Alpine Linux inside chroot, i.e. on system with musl libc.
|
||||
# These jobs need sudo, so Travis runs them on VM-based infrastructure
|
||||
# which is slower than container-based infrastructure used for jobs
|
||||
# that don't require sudo.
|
||||
- &test-alpine
|
||||
os: linux
|
||||
dist: trusty
|
||||
sudo: true
|
||||
language: minimal
|
||||
before_install:
|
||||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
install:
|
||||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||
before_script: *common-before
|
||||
script:
|
||||
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
||||
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
||||
- alpine make -C test $COMMON_FLAGS $BTYPE
|
||||
- alpine make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- alpine make -C utest $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- TARGET_BOX=LINUX64_MUSL
|
||||
- BTYPE="BINARY=64"
|
||||
# - &test-alpine
|
||||
# os: linux
|
||||
# dist: trusty
|
||||
# sudo: true
|
||||
# language: minimal
|
||||
# before_install:
|
||||
# - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
# && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
|
||||
# - alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
# install:
|
||||
# - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||
# before_script: *common-before
|
||||
# script:
|
||||
# # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
|
||||
# - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
|
||||
# - alpine make -C test $COMMON_FLAGS $BTYPE
|
||||
# - alpine make -C ctest $COMMON_FLAGS $BTYPE
|
||||
# - alpine make -C utest $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64"
|
||||
|
||||
# XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS,
|
||||
# but only on Travis CI, cannot reproduce it elsewhere.
|
||||
|
|
@ -171,89 +176,98 @@ matrix:
|
|||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64 USE_OPENMP=1"
|
||||
|
||||
- <<: *test-alpine
|
||||
env:
|
||||
- TARGET_BOX=LINUX64_MUSL
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
# - <<: *test-alpine
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64 INTERFACE64=1"
|
||||
#
|
||||
# # Build with the same flags as Alpine does in its OpenBLAS package.
|
||||
# - <<: *test-alpine
|
||||
# env:
|
||||
# - TARGET_BOX=LINUX64_MUSL
|
||||
# - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
|
||||
|
||||
# Build with the same flags as Alpine does in its OpenBLAS package.
|
||||
- <<: *test-alpine
|
||||
env:
|
||||
- TARGET_BOX=LINUX64_MUSL
|
||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
|
||||
# - &test-cmake
|
||||
# os: linux
|
||||
# compiler: clang
|
||||
# addons:
|
||||
# apt:
|
||||
# packages:
|
||||
# - gfortran
|
||||
# - cmake
|
||||
# dist: trusty
|
||||
# sudo: true
|
||||
# before_script:
|
||||
# - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
|
||||
# script:
|
||||
# - mkdir build
|
||||
# - CONFIG=Release
|
||||
# - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
|
||||
# - cmake --build build --config $CONFIG -- -j2
|
||||
# env:
|
||||
# - CMAKE=1
|
||||
# - <<: *test-cmake
|
||||
# env:
|
||||
# - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1"
|
||||
# - <<: *test-cmake
|
||||
# compiler: gcc
|
||||
# env:
|
||||
# - CMAKE=1
|
||||
|
||||
- &test-cmake
|
||||
os: linux
|
||||
compiler: clang
|
||||
addons:
|
||||
apt:
|
||||
packages:
|
||||
- gfortran
|
||||
- cmake
|
||||
dist: trusty
|
||||
sudo: true
|
||||
before_script:
|
||||
- COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
|
||||
script:
|
||||
- mkdir build
|
||||
- CONFIG=Release
|
||||
- cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
|
||||
- cmake --build build --config $CONFIG -- -j2
|
||||
env:
|
||||
- CMAKE=1
|
||||
- <<: *test-cmake
|
||||
env:
|
||||
- CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1"
|
||||
- <<: *test-cmake
|
||||
compiler: gcc
|
||||
env:
|
||||
- CMAKE=1
|
||||
# - &test-macos
|
||||
# os: osx
|
||||
# osx_image: xcode11.5
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
|
||||
#
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode12
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# - brew update
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
|
||||
#
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode12
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# - brew update
|
||||
# script:
|
||||
# - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
# env:
|
||||
# - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
|
||||
- &test-macos
|
||||
os: osx
|
||||
osx_image: xcode11.5
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode12
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc@10
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode10
|
||||
# env:
|
||||
# - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode11.5
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
env:
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode11.5
|
||||
# before_script:
|
||||
# - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
# - brew update
|
||||
# env:
|
||||
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
- <<: *test-macos
|
||||
osx_image: xcode11.5
|
||||
env:
|
||||
# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
- CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
- BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
|
||||
# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
# - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode11.5
|
||||
# env:
|
||||
## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
# - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
|
||||
|
||||
- &test-graviton2
|
||||
os: linux
|
||||
|
|
|
|||
352
CMakeLists.txt
352
CMakeLists.txt
|
|
@ -3,10 +3,13 @@
|
|||
##
|
||||
|
||||
cmake_minimum_required(VERSION 2.8.5)
|
||||
|
||||
project(OpenBLAS C ASM)
|
||||
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 12.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 20)
|
||||
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
|
@ -14,54 +17,74 @@ include(GNUInstallDirs)
|
|||
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
if(MSVC AND NOT DEFINED NOFORTRAN)
|
||||
set(NOFORTRAN ON)
|
||||
endif()
|
||||
|
||||
#######
|
||||
if(MSVC)
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
endif()
|
||||
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
|
||||
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||
|
||||
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||
|
||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||
|
||||
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
else()
|
||||
set(NO_AFFINITY 1)
|
||||
set(NO_AFFINITY 1)
|
||||
endif()
|
||||
|
||||
option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)
|
||||
|
||||
option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
|
||||
option(BUILD_STATIC_LIBS "Build static library" OFF)
|
||||
if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS)
|
||||
set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE)
|
||||
endif()
|
||||
if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC)
|
||||
message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS")
|
||||
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static library" FORCE)
|
||||
endif()
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoids conflicts with other BLAS libraries, especially when using
|
||||
# 64 bit integer interfaces in OpenBLAS.
|
||||
|
||||
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
|
||||
|
||||
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
|
||||
|
||||
#######
|
||||
if(BUILD_WITHOUT_LAPACK)
|
||||
set(NO_LAPACK 1)
|
||||
set(NO_LAPACKE 1)
|
||||
set(NO_LAPACK 1)
|
||||
set(NO_LAPACKE 1)
|
||||
endif()
|
||||
|
||||
if(BUILD_WITHOUT_CBLAS)
|
||||
set(NO_CBLAS 1)
|
||||
set(NO_CBLAS 1)
|
||||
endif()
|
||||
|
||||
#######
|
||||
|
||||
if(MSVC AND MSVC_STATIC_CRT)
|
||||
set(CompilerFlags
|
||||
CMAKE_CXX_FLAGS
|
||||
CMAKE_CXX_FLAGS_DEBUG
|
||||
CMAKE_CXX_FLAGS_RELEASE
|
||||
CMAKE_C_FLAGS
|
||||
CMAKE_C_FLAGS_DEBUG
|
||||
CMAKE_C_FLAGS_RELEASE
|
||||
)
|
||||
foreach(CompilerFlag ${CompilerFlags})
|
||||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
|
||||
endforeach()
|
||||
set(CompilerFlags
|
||||
CMAKE_CXX_FLAGS
|
||||
CMAKE_CXX_FLAGS_DEBUG
|
||||
CMAKE_CXX_FLAGS_RELEASE
|
||||
CMAKE_C_FLAGS
|
||||
CMAKE_C_FLAGS_DEBUG
|
||||
CMAKE_C_FLAGS_RELEASE
|
||||
)
|
||||
foreach(CompilerFlag ${CompilerFlags})
|
||||
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||
|
|
@ -95,7 +118,7 @@ endif ()
|
|||
# set which float types we want to build for
|
||||
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
|
||||
# if none are defined, build for all
|
||||
# set(BUILD_BFLOAT16 true)
|
||||
# set(BUILD_BFLOAT16 true)
|
||||
set(BUILD_SINGLE true)
|
||||
set(BUILD_DOUBLE true)
|
||||
set(BUILD_COMPLEX true)
|
||||
|
|
@ -129,7 +152,7 @@ endif ()
|
|||
|
||||
if (BUILD_BFLOAT16)
|
||||
message(STATUS "Building Half Precision")
|
||||
list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
|
||||
# list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
|
||||
|
|
@ -140,9 +163,10 @@ endif ()
|
|||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
if(MSVC)
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
|
||||
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
|
||||
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
|
||||
endif ()
|
||||
|
||||
# get obj vars into format that add_library likes: $<TARGET_OBJECTS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
|
||||
set(TARGET_OBJS "")
|
||||
foreach (SUBDIR ${SUBDIRS})
|
||||
|
|
@ -180,12 +204,63 @@ if (${DYNAMIC_ARCH})
|
|||
endif ()
|
||||
|
||||
# add objects to the openblas lib
|
||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
|
||||
if(NOT NO_LAPACK)
|
||||
add_library(LAPACK OBJECT ${LA_SOURCES})
|
||||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>")
|
||||
endif()
|
||||
if(NOT NO_LAPACKE)
|
||||
add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
|
||||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>")
|
||||
endif()
|
||||
if(BUILD_RELAPACK)
|
||||
add_library(RELAPACK OBJECT ${RELA_SOURCES})
|
||||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
|
||||
endif()
|
||||
set(OpenBLAS_LIBS "")
|
||||
if(BUILD_STATIC_LIBS)
|
||||
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
|
||||
list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static)
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||
target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
|
||||
list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared)
|
||||
endif()
|
||||
if(BUILD_STATIC_LIBS)
|
||||
add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static)
|
||||
else()
|
||||
add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared)
|
||||
endif()
|
||||
|
||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
|
||||
|
||||
# Android needs to explicitly link against libm
|
||||
if(ANDROID)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} m)
|
||||
if(BUILD_STATIC_LIBS)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_static m)
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared m)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
|
||||
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
|
||||
if (NOT NOFORTRAN)
|
||||
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
|
||||
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
|
||||
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
|
||||
"sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
|
||||
"sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
|
||||
else ()
|
||||
set (CMAKE_C_CREATE_SHARED_LIBRARY
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
|
||||
"sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
|
||||
"sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
# Handle MSVC exports
|
||||
|
|
@ -194,21 +269,21 @@ if(MSVC AND BUILD_SHARED_LIBS)
|
|||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
||||
else()
|
||||
# Creates verbose .def file (51KB vs 18KB)
|
||||
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
|
||||
set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Set output for libopenblas
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||
|
||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
|
||||
endforeach()
|
||||
|
||||
enable_testing()
|
||||
|
|
@ -217,10 +292,17 @@ if (USE_THREAD)
|
|||
# Add threading library to linker
|
||||
find_package(Threads)
|
||||
if (THREADS_HAVE_PTHREAD_ARG)
|
||||
set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread")
|
||||
set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread")
|
||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
||||
COMPILE_OPTIONS "-pthread"
|
||||
INTERFACE_COMPILE_OPTIONS "-pthread"
|
||||
)
|
||||
endif()
|
||||
if(BUILD_STATIC_LIBS)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif()
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif()
|
||||
|
||||
#if (MSVC OR NOT NOFORTRAN)
|
||||
|
|
@ -229,104 +311,116 @@ if (NOT NO_CBLAS)
|
|||
add_subdirectory(utest)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC AND NOT NOFORTRAN)
|
||||
if (NOT NOFORTRAN)
|
||||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
endif()
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
||||
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
|
||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||
if (NOT MSVC)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
|
||||
target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
|
||||
else()
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
else()
|
||||
set(ARCH_IN ${ARCH})
|
||||
endif()
|
||||
if (NOT DEFINED ARCH)
|
||||
set(ARCH_IN "x86_64")
|
||||
else()
|
||||
set(ARCH_IN ${ARCH})
|
||||
endif()
|
||||
|
||||
if (${CORE} STREQUAL "generic")
|
||||
set(ARCH_IN "GENERIC")
|
||||
endif ()
|
||||
if (${CORE} STREQUAL "generic")
|
||||
set(ARCH_IN "GENERIC")
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED EXPRECISION)
|
||||
set(EXPRECISION_IN 0)
|
||||
else()
|
||||
set(EXPRECISION_IN ${EXPRECISION})
|
||||
endif()
|
||||
if (NOT DEFINED EXPRECISION)
|
||||
set(EXPRECISION_IN 0)
|
||||
else()
|
||||
set(EXPRECISION_IN ${EXPRECISION})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_CBLAS)
|
||||
set(NO_CBLAS_IN 0)
|
||||
else()
|
||||
set(NO_CBLAS_IN ${NO_CBLAS})
|
||||
endif()
|
||||
if (NOT DEFINED NO_CBLAS)
|
||||
set(NO_CBLAS_IN 0)
|
||||
else()
|
||||
set(NO_CBLAS_IN ${NO_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACK)
|
||||
set(NO_LAPACK_IN 0)
|
||||
else()
|
||||
set(NO_LAPACK_IN ${NO_LAPACK})
|
||||
endif()
|
||||
if (NOT DEFINED NO_LAPACK)
|
||||
set(NO_LAPACK_IN 0)
|
||||
else()
|
||||
set(NO_LAPACK_IN ${NO_LAPACK})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NO_LAPACKE)
|
||||
set(NO_LAPACKE_IN 0)
|
||||
else()
|
||||
set(NO_LAPACKE_IN ${NO_LAPACKE})
|
||||
endif()
|
||||
if (NOT DEFINED NO_LAPACKE)
|
||||
set(NO_LAPACKE_IN 0)
|
||||
else()
|
||||
set(NO_LAPACKE_IN ${NO_LAPACKE})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NEED2UNDERSCORES)
|
||||
set(NEED2UNDERSCORES_IN 0)
|
||||
else()
|
||||
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
|
||||
endif()
|
||||
if (NOT DEFINED NEED2UNDERSCORES)
|
||||
set(NEED2UNDERSCORES_IN 0)
|
||||
else()
|
||||
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED ONLY_CBLAS)
|
||||
set(ONLY_CBLAS_IN 0)
|
||||
else()
|
||||
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||
endif()
|
||||
if (NOT DEFINED ONLY_CBLAS)
|
||||
set(ONLY_CBLAS_IN 0)
|
||||
else()
|
||||
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED BU)
|
||||
set(BU _)
|
||||
endif()
|
||||
if (NOT DEFINED BU)
|
||||
set(BU _)
|
||||
endif()
|
||||
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
endif()
|
||||
|
||||
|
||||
# Install project
|
||||
|
||||
# Install libraries
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS)
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}_shared
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
install(TARGETS ${OpenBLAS_LIBNAME}_static
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
else()
|
||||
install(TARGETS ${OpenBLAS_LIBS}
|
||||
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||
endif()
|
||||
|
||||
# Install headers
|
||||
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
|
|
@ -362,36 +456,41 @@ if(NOT NOFORTRAN)
|
|||
endif()
|
||||
|
||||
if(NOT NO_CBLAS)
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
|
||||
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||
endif()
|
||||
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
endif()
|
||||
|
||||
if(NOT NO_LAPACKE)
|
||||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
|
||||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
|
||||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||
if(BUILD_STATIC_LIBS)
|
||||
add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke)
|
||||
endif()
|
||||
if(BUILD_SHARED_LIBS)
|
||||
add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke)
|
||||
endif()
|
||||
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
|
||||
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
ADD_CUSTOM_TARGET(genlapacke
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||
)
|
||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||
endif()
|
||||
|
||||
# Install pkg-config files
|
||||
|
|
@ -416,4 +515,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
|||
install(EXPORT "${PN}${SUFFIX64}Targets"
|
||||
NAMESPACE "${PN}${SUFFIX64}::"
|
||||
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||
|
||||
|
|
|
|||
|
|
@ -194,3 +194,16 @@ In chronological order:
|
|||
|
||||
* PingTouGe Semiconductor Co., Ltd.
|
||||
* [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
|
||||
|
||||
* River Dillon <oss@outerpassage.net>
|
||||
* [2021-07-10] fix compilation with musl libc
|
||||
|
||||
* Bine Brank <https://github.com/binebrank>
|
||||
* [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE
|
||||
* [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM
|
||||
* [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions
|
||||
* [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions
|
||||
* [2022-01-18] SVE kernels and copy functions for TRSM
|
||||
|
||||
* Ilya Kurdyukov <https://github.com/ilyakurdyukov>
|
||||
* [2021-02-21] Add basic support for the Elbrus E2000 architecture
|
||||
|
|
|
|||
336
Changelog.txt
336
Changelog.txt
|
|
@ -1,4 +1,340 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.20
|
||||
20-Feb-2022
|
||||
|
||||
general:
|
||||
- some code cleanup, with added casts etc.
|
||||
- fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset
|
||||
- fixed pivot index calculation by ?LASWP for negative increments other than one
|
||||
- fixed input argument check in LAPACK ?GEQRT2
|
||||
- improved the check for a Fortran compiler in CMAKE builds
|
||||
- disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1
|
||||
- fixed building of LAPACK on certain distributed filesystems with parallel gmake
|
||||
- fixed building the shared library on MacOS with classic flang
|
||||
|
||||
x86_64:
|
||||
- fixed cross-compilation with CMAKE for CORE2 target
|
||||
- fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds
|
||||
- added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS
|
||||
|
||||
E2K:
|
||||
- add new architecture (Russian Elbrus E2000 family)
|
||||
|
||||
SPARC:
|
||||
- fix IMIN/IMAX
|
||||
|
||||
ARMV8:
|
||||
- added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX
|
||||
- added support for Neoverse N2 and V1 cpus
|
||||
|
||||
MIPS,MIPS64:
|
||||
- fixed autodetection of MSA capability
|
||||
|
||||
LOONGARCH64:
|
||||
- added an optimized DGEMM kernel
|
||||
|
||||
====================================================================
|
||||
Version 0.3.19
|
||||
19-Dec-2021
|
||||
|
||||
general:
|
||||
- reverted unsafe TRSV/ZTRSV optimizations introduced in 0.3.16
|
||||
- fixed a potential thread race in the thread buffer reallocation routines
|
||||
that were introduced in 0.3.18
|
||||
- fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE
|
||||
- fixed CBLAS interfaces for CSROT/ZDROT and CROTG/ZROTG
|
||||
- made automatic library suffix for CMAKE builds with INTERFACE64 available
|
||||
to CBLAS-only builds
|
||||
|
||||
x86_64:
|
||||
- DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities
|
||||
when an unknown CPUID is encountered, instead of defaulting to Prescott
|
||||
- added cpu detection for Intel Alder Lake
|
||||
- added cpu detection for Intel Sapphire Rapids
|
||||
- added an optimized SBGEMM kernel for Sapphire Rapids
|
||||
- fixed DYNAMIC_ARCH builds on OSX with CMAKE
|
||||
- worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX
|
||||
- fixed missing thread initialization for static builds on Windows/MSVC
|
||||
- fixed an excessive read in ZSYMV
|
||||
|
||||
POWER:
|
||||
- added support for POWER10 in big-endian mode
|
||||
- added support for building with CMAKE
|
||||
- added optimized SGEMM and DGEMM kernels for small matrix sizes
|
||||
|
||||
ARMV8:
|
||||
- added basic support and cputype detection for Fujitsu A64FX
|
||||
- added a generic ARMV8SVE target
|
||||
- added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX
|
||||
- added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus
|
||||
- fixed cpuid detection for Apple M1 and improved performance
|
||||
- improved compiler flag setting in CMAKE builds
|
||||
|
||||
RISCV64:
|
||||
- fixed improper initialization in CSCAL/ZSCAL for strided access patterns
|
||||
|
||||
MIPS:
|
||||
- added a GENERIC target for MIPS32
|
||||
- added support for cross-compiling to MIPS32 on x86_64 using CMAKE
|
||||
|
||||
MIPS64:
|
||||
- fixed misdetection of MSA capability
|
||||
|
||||
====================================================================
|
||||
Version 0.3.18
|
||||
02-Oct-2021
|
||||
|
||||
general:
|
||||
- when the build-time number of preconfigured threads is exceeded
|
||||
at runtime (typically by an external program calling BLAS functions
|
||||
from a larger number of threads in parallel), OpenBLAS will now
|
||||
allocate an auxiliary control structure for up to 512 additional
|
||||
threads instead of aborting
|
||||
- added support for Loongson's LoongArch64 cpu architecture
|
||||
- fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON
|
||||
- added support for building OpenBLAS as a CMAKE subproject
|
||||
- added support for building for Windows/ARM64 targets with clang
|
||||
- improved support for building with the IBM xlf compiler
|
||||
- imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV)
|
||||
- imported Reference-LAPACK PR 597 for testsuite compatibility with
|
||||
LLVM's libomp
|
||||
|
||||
x86_64:
|
||||
- added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000)
|
||||
- added optimized SBGEMM for Intel Cooper Lake
|
||||
- reinstated the performance patch for AVX512 SGEMV_T with a proper fix
|
||||
- added a workaround for a gcc11 tree-vectorizer bug that caused spurious
|
||||
failures in the test programs for complex BLAS3 when compiling at -O3
|
||||
(the default for cmake "release" builds)
|
||||
- added support for runtime cpu count detection under Haiku OS
|
||||
- worked around a long-standing miscompilation issue of the Haswell DGEMV_T
|
||||
kernel with gcc that could produce NaN output in some corner cases
|
||||
|
||||
POWER:
|
||||
- improved performance of DASUM on POWER10
|
||||
|
||||
ARMV8:
|
||||
- fixed crashes (use of reserved register x18) on Apple M1 under OSX
|
||||
- fixed building with gcc releases earlier than 5.1
|
||||
|
||||
MIPS:
|
||||
- fixed building under BSD
|
||||
|
||||
MIPS64:
|
||||
- fixed building under BSD
|
||||
|
||||
====================================================================
|
||||
Version 0.3.17
|
||||
15-Jul-2021
|
||||
|
||||
common:
|
||||
- reverted the optimization of SGEMV_N/DGEMV_N for small input sizes
|
||||
and consecutive arguments as it led to stack overflows on x86_64
|
||||
with some operating systems (notably OSX and Windows)
|
||||
|
||||
x86_64:
|
||||
- reverted the performance patch for SGEMV_T on AVX512 as it caused
|
||||
wrong results in some applications
|
||||
|
||||
SPARC:
|
||||
- fixed compilation with compilers other than gcc
|
||||
====================================================================
|
||||
Version 0.3.16
|
||||
11-Jul-2021
|
||||
|
||||
common:
|
||||
- drastically reduced the stack size requirements for running the LAPACK
|
||||
testsuite (Reference-LAPACK PR 553)
|
||||
- fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK
|
||||
PR 564)
|
||||
- expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode
|
||||
- improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N
|
||||
and DGEMV_N, for small input sizes and consecutive arguments
|
||||
- improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes
|
||||
by disabling multithreading
|
||||
- fixed installing with BSD versions of the "install" utility
|
||||
|
||||
RISCV:
|
||||
- fixed the implementation of xIMIN
|
||||
- improved the performance of DSDOT
|
||||
- fixed linking of the tests on C910V with current vendor gcc
|
||||
|
||||
POWER:
|
||||
- fixed SBGEMM computation for some odd value inputs
|
||||
- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5
|
||||
|
||||
x86_64:
|
||||
- improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus
|
||||
- worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc
|
||||
versions
|
||||
- fixed compilation with MS Visual Studio versions older than 2017
|
||||
- fixed macro name collision with winnt.h from the latest Win10 SDK
|
||||
- added cpu type autodetection for Intel Ice Lake SP
|
||||
- fixed cpu type autodetection for Intel Tiger Lake
|
||||
- added cpu type autodetection for recent Centaur/Zhaoxin models
|
||||
- fixed compilation with musl libc
|
||||
|
||||
ARM64:
|
||||
- fixed compilation with gcc/gfortran on the Apple M1
|
||||
- fixed linking of the tests on FreeBSD
|
||||
- fixed missing restore of a register in the recently rewritten DNRM2 kernel
|
||||
for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g.
|
||||
DGEEV
|
||||
- added compiler optimization flags for the EMAG8180
|
||||
- added initial support for Cortex A55
|
||||
|
||||
ARM:
|
||||
- fixed linking of the tests on FreeBSD
|
||||
|
||||
====================================================================
|
||||
Version 0.3.15
|
||||
2-May-2021
|
||||
|
||||
common:
|
||||
- imported improvements and bugfixes from Reference-LAPACK 3.9.1
|
||||
- imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537
|
||||
- fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation
|
||||
- fixed a sequence problem in the generation of softlinks to the library in GMAKE
|
||||
|
||||
RISC V:
|
||||
- fixed compilation on RISCV (missing entry in getarch)
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
POWER:
|
||||
- fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler
|
||||
- improved CGEMM, DGEMM and ZGEMM performance on POWER10
|
||||
- added an optimized ZGEMV kernel for POWER10
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
x86_64:
|
||||
- added support for Intel Control-flow Enforcement Technology (CET)
|
||||
- reverted the DOMATCOPY_RT code to the generic C version
|
||||
- fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14
|
||||
- fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH
|
||||
- added support for compilation of the benchmarks on older OSX versions
|
||||
- fix propagation of the NO_AVX512 option in CMAKE builds
|
||||
- fix compilation of the AVX512 SGEMM kernel with clang-cl on Windows
|
||||
- fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX)
|
||||
- corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512
|
||||
|
||||
ARM:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
|
||||
|
||||
ARM64:
|
||||
- fixed spurious reads outside the array in the SGEMM tcopy macro
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14)
|
||||
|
||||
MIPS:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
|
||||
|
||||
MIPS64:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
SPARC:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
====================================================================
|
||||
Version 0.3.14
|
||||
17-Mar-2021
|
||||
|
||||
common:
|
||||
* Fixed a race condition on thread shutdown in non-OpenMP builds
|
||||
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
|
||||
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
|
||||
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
|
||||
* Improved performance of OMATCOPY_RT across all platforms
|
||||
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
|
||||
* Fixed potential misreading of the GCC compiler version in the build scripts
|
||||
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
|
||||
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
|
||||
|
||||
RISCV:
|
||||
* Fixed compilation on RISCV (missing entry in getarch)
|
||||
|
||||
POWER:
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions
|
||||
* Added support for compilation on FreeBSD/ppc64le
|
||||
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
|
||||
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
|
||||
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
|
||||
* Improved SCOPY and CCOPY performance on POWER10
|
||||
* Improved SGEMM and DGEMM performance on POWER10
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
|
||||
x86_64:
|
||||
* Added an optimized bfloat16 GEMM kernel for Cooperlake
|
||||
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
|
||||
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
|
||||
* Added support for compilation with the NAG Fortran compiler
|
||||
* Fixed recognition of the AMD AOCC compiler
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
|
||||
* Added support for running the BLAS/CBLAS tests on Windows
|
||||
* Fixed signatures of the tls callback functions for Windows x64
|
||||
* Fixed various issues with fma intrinsics support handling
|
||||
|
||||
ARM:
|
||||
* Added support for embedded Cortex M targets via a new option EMBEDDED
|
||||
|
||||
ARMV8:
|
||||
* Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf
|
||||
* Added support for the DYNAMIC_LIST option
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
* Added support for compiling with the NAG Fortran compiler
|
||||
|
||||
====================================================================
|
||||
Version 0.3.13
|
||||
12-Dec-2020
|
||||
|
||||
common:
|
||||
* Added a generic bfloat16 SBGEMV kernel
|
||||
* Fixed a potentially severe memory leak after fork in OpenMP builds
|
||||
that was introduced in 0.3.12
|
||||
* Added detection of the Fujitsu Fortran compiler
|
||||
* Added detection of the (e)gfortran compiler on OpenBSD
|
||||
* Added support for overriding the default name of the library independently
|
||||
from symbol suffixing in the gmake builds (already supported in cmake)
|
||||
|
||||
RISCV:
|
||||
* Added a RISC V port optimized for C910V
|
||||
|
||||
POWER:
|
||||
* Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N
|
||||
* Improved DGEMM performance on POWER10
|
||||
* Improved STRSM and DTRSM performance on POWER9 and POWER10
|
||||
* Fixed segmentation faults in DYNAMIC_ARCH builds
|
||||
* Fixed compilation with the PGI compiler
|
||||
|
||||
x86:
|
||||
* Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12
|
||||
|
||||
x86_64:
|
||||
* Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake
|
||||
* Improved the performance of SASUM and DASUM kernels through parallelization
|
||||
* Improved the performance of SROT and DROT kernels
|
||||
* Improved the performance of multithreaded xSYRK
|
||||
* Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran
|
||||
(where linking of both the LLVM libomp and GNU libgomp could lead to lockups or
|
||||
wrong results)
|
||||
* Fixed miscompilations by old gcc 4.6
|
||||
* Fixed misdetection of AVX2 capability in some Sandybridge cpus
|
||||
* Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD
|
||||
|
||||
ARM64:
|
||||
* Fixed segmentation faults in DYNAMIC_ARCH builds
|
||||
|
||||
MIPS:
|
||||
* Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA
|
||||
* Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV
|
||||
* Added handling of zero increments in the MSA kernels for SSWAP and DSWAP
|
||||
* Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only)
|
||||
|
||||
SPARC:
|
||||
* Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers
|
||||
|
||||
====================================================================
|
||||
Version 0.3.12
|
||||
24-Oct-2020
|
||||
|
|
|
|||
12
Makefile
12
Makefile
|
|
@ -32,7 +32,7 @@ export NOFORTRAN
|
|||
export NO_LAPACK
|
||||
endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||
|
||||
|
|
@ -59,6 +59,9 @@ endif
|
|||
@$(CC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${cverinfo}" ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
|
||||
else \
|
||||
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
|
||||
|
|
@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
|||
@$(FC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${fverinfo}" ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
|
||||
else \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
|
||||
|
|
@ -161,7 +167,6 @@ ifeq ($(NO_SHARED), 1)
|
|||
$(error OpenBLAS: neither static nor shared are enabled.)
|
||||
endif
|
||||
endif
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
|
|
@ -190,6 +195,7 @@ endif
|
|||
ifdef USE_THREAD
|
||||
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||
endif
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@touch lib.grd
|
||||
|
||||
prof : prof_blas prof_lapack
|
||||
|
|
@ -263,7 +269,7 @@ prof_lapack : lapack_prebuild
|
|||
lapack_prebuild :
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
|
|
|||
158
Makefile.arm64
158
Makefile.arm64
|
|
@ -1,80 +1,234 @@
|
|||
ifneq ($(C_COMPILER), PGI)
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
ISCLANG=1
|
||||
endif
|
||||
ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
|
||||
else
|
||||
|
||||
|
||||
ifeq ($(CORE), ARMV8)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV8SVE)
|
||||
CCOMMON_OPT += -march=armv8-a+sve
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a+sve
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA72)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA73)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N1 is only available
|
||||
# in GCC>=9
|
||||
ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-V1 is only available
|
||||
# in GCC>=9.4
|
||||
ifeq ($(CORE), NEOVERSEV1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
|
||||
CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a -mtune=native
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N2 is only available
|
||||
# in GCC>=9.4
|
||||
ifeq ($(CORE), NEOVERSEN2)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
|
||||
CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||
ifeq ($(CORE), CORTEXA55)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ8), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FALKOR)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX3T110)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), VORTEX)
|
||||
CCOMMON_OPT += -march=armv8.3-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), EMAG8180)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), A64FX)
|
||||
CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
COPT = -Wall -O2 # -DGEMMTEST
|
||||
|
|
@ -74,17 +74,17 @@ endif
|
|||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
# Install the static library
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
|
|
@ -92,7 +92,7 @@ endif
|
|||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,3 @@
|
|||
ifdef BINARY64
|
||||
else
|
||||
endif
|
||||
|
|
@ -10,9 +10,15 @@ USE_OPENMP = 1
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
|
|
@ -31,7 +37,11 @@ else
|
|||
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
||||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
|
||||
|
|
@ -55,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
|
|||
endif
|
||||
ifneq ($(F_COMPILER), PGI)
|
||||
ifeq ($(OSNAME), AIX)
|
||||
ifeq ($(F_COMPILER), IBM)
|
||||
FCOMMON_OPT += -O2 -qrecur -qnosave
|
||||
else
|
||||
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -3,6 +3,10 @@
|
|||
export BINARY
|
||||
export USE_OPENMP
|
||||
|
||||
ifdef DYNAMIC_ARCH
|
||||
override HOST_CFLAGS += -DDYNAMIC_ARCH
|
||||
endif
|
||||
|
||||
ifdef TARGET_CORE
|
||||
TARGET_MAKE = Makefile_kernel.conf
|
||||
TARGET_CONF = config_kernel.h
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.12.dev
|
||||
VERSION = 0.3.20
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
|||
180
Makefile.system
180
Makefile.system
|
|
@ -9,11 +9,10 @@ ifndef TOPDIR
|
|||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||
ifndef ARCH
|
||||
# We need to use the host system's architecture for getarch compile options, even (and especially) when cross-compiling.
|
||||
HOSTARCH := $(shell uname -m)
|
||||
else
|
||||
HOSTARCH = $(ARCH)
|
||||
ifeq ($(HOSTARCH), amd64)
|
||||
HOSTARCH=x86_64
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
|
|
@ -21,6 +20,8 @@ ifeq ($(ARCH), amd64)
|
|||
override ARCH=x86_64
|
||||
else ifeq ($(ARCH), powerpc64)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc64le)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), i386)
|
||||
|
|
@ -31,6 +32,10 @@ else ifeq ($(ARCH), armv7)
|
|||
override ARCH=arm
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
override ARCH=arm64
|
||||
else ifeq ($(ARCH), mipsel)
|
||||
override ARCH=mips
|
||||
else ifeq ($(ARCH), mips64el)
|
||||
override ARCH=mips64
|
||||
else ifeq ($(ARCH), zarch)
|
||||
override ARCH=zarch
|
||||
endif
|
||||
|
|
@ -96,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET
|
|||
ifeq ($(TARGET), GENERIC)
|
||||
ifeq ($(DYNAMIC_ARCH), 1)
|
||||
override NO_EXPRECISION=1
|
||||
export NO_EXPRECiSION
|
||||
export NO_EXPRECISION
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
|
@ -113,6 +118,9 @@ endif
|
|||
ifeq ($(TARGET), COOPERLAKE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SAPPHIRERAPIDS)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
|
|
@ -137,8 +145,13 @@ endif
|
|||
ifeq ($(TARGET), POWER8)
|
||||
GETARCH_FLAGS := -DFORCE_POWER6
|
||||
endif
|
||||
ifeq ($(TARGET), POWER9)
|
||||
GETARCH_FLAGS := -DFORCE_POWER6
|
||||
endif
|
||||
ifeq ($(TARGET), POWER10)
|
||||
GETARCH_FLAGS := -DFORCE_POWER6
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
|
||||
#
|
||||
|
|
@ -158,6 +171,9 @@ endif
|
|||
ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
|
|
@ -181,7 +197,7 @@ endif
|
|||
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
|
@ -242,12 +258,26 @@ else
|
|||
ONLY_CBLAS = 0
|
||||
endif
|
||||
|
||||
#For small matrix optimization
|
||||
ifeq ($(ARCH), x86_64)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
else ifeq ($(CORE), POWER10)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
endif
|
||||
ifeq ($(SMALL_MATRIX_OPT), 1)
|
||||
CCOMMON_OPT += -DSMALL_MATRIX_OPT
|
||||
endif
|
||||
|
||||
# This operation is expensive, so it should be executed only once.
|
||||
ifndef GOTOBLAS_MAKEFILE
|
||||
export GOTOBLAS_MAKEFILE = 1
|
||||
|
||||
# Determine if the assembler is GNU Assembler
|
||||
HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
|
||||
GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
|
||||
|
||||
# Generating Makefile.conf and config.h
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||
|
||||
ifndef TARGET_CORE
|
||||
include $(TOPDIR)/Makefile.conf
|
||||
|
|
@ -293,7 +323,7 @@ else
|
|||
SMP = 1
|
||||
endif
|
||||
else
|
||||
ifeq ($(NUM_THREAD), 1)
|
||||
ifeq ($(NUM_THREADS), 1)
|
||||
SMP =
|
||||
else
|
||||
SMP = 1
|
||||
|
|
@ -331,6 +361,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
|||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
|
|
@ -343,6 +374,7 @@ else
|
|||
endif
|
||||
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
|
||||
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
|
||||
GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
||||
endif
|
||||
|
||||
|
|
@ -378,6 +410,12 @@ ifeq ($(OSNAME), AIX)
|
|||
EXTRALIB += -lm
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
NEED_PIC = 0
|
||||
NO_EXPRECISION = 1
|
||||
|
|
@ -617,12 +655,24 @@ DYNAMIC_CORE += CORTEXA57
|
|||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += NEOVERSEN1
|
||||
DYNAMIC_CORE += NEOVERSEV1
|
||||
DYNAMIC_CORE += NEOVERSEN2
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
DYNAMIC_CORE += EMAG8180
|
||||
DYNAMIC_CORE += THUNDERX3T110
|
||||
ifdef DYNAMIC_LIST
|
||||
override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST)
|
||||
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8
|
||||
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
|
|
@ -659,6 +709,7 @@ endif
|
|||
endif # ARCH zarch
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
|
|
@ -672,7 +723,7 @@ DYNAMIC_CORE += POWER9
|
|||
else
|
||||
# Tell the user why POWER9 is left out of DYNAMIC_CORE.
# Note: there must be no comma after the function name; with "$(info, ...)" make
# does not recognise the function and expands the text as an (empty) variable
# reference, so the message is never printed.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35)
|
||||
LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
|
||||
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
|
||||
DYNAMIC_CORE += POWER10
|
||||
CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
||||
|
|
@ -685,6 +736,10 @@ else
|
|||
# Tell the user why POWER10 is left out of DYNAMIC_CORE.
# Note: there must be no comma after the function name; with "$(info, ...)" make
# does not recognise the function and expands the text as an (empty) variable
# reference, so the message is never printed.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
DYNAMIC_CORE = POWER8
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
|
|
@ -756,6 +811,11 @@ NO_BINARY_MODE = 1
|
|||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
|
||||
#
|
||||
# C Compiler dependent settings
|
||||
|
|
@ -787,14 +847,9 @@ CCOMMON_OPT += -mabi=32
|
|||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
||||
CCOMMON_OPT += -march=loongson3a
|
||||
FCOMMON_OPT += -march=loongson3a
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), MIPS24K)
|
||||
|
|
@ -831,6 +886,13 @@ ifeq ($(OSNAME), AIX)
|
|||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
ifeq ($(CORE), LOONGSON3R5)
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
ifndef BINARY_DEFINED
|
||||
|
|
@ -848,9 +910,29 @@ endif
|
|||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
|
||||
PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
|
||||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11)
|
||||
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
|
||||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
|
||||
NEWPGI := 1
|
||||
PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21)
|
||||
PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21)
|
||||
PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11)
|
||||
ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011))
|
||||
NEWPGI2 := 1
|
||||
endif
|
||||
endif
|
||||
ifdef BINARY64
|
||||
ifeq ($(ARCH), x86_64)
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
ifneq ($(NEWPGI2),1)
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
CCOMMON_OPT += -tp px
|
||||
endif
|
||||
ifneq ($(NEWPGI),1)
|
||||
CCOMMON_OPT += -D__MMX__ -Mnollvm
|
||||
endif
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER8)
|
||||
|
|
@ -862,7 +944,11 @@ endif
|
|||
endif
|
||||
endif
|
||||
else
|
||||
ifneq ($(NEWPGI2),1)
|
||||
CCOMMON_OPT += -tp p7
|
||||
else
|
||||
CCOMMON_OPT += -tp px
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
@ -878,13 +964,25 @@ endif
|
|||
# Fortran Compiler dependent settings
|
||||
#
|
||||
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
|
||||
ifeq ($(FLANG_VENDOR),AOCC)
|
||||
FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
|
||||
ifeq ($(FLANG_VENDOR), AMD)
|
||||
FCOMMON_OPT += -fno-unroll-loops
|
||||
endif
|
||||
endif
|
||||
|
|
@ -1027,21 +1125,31 @@ FCOMMON_OPT += -i8
|
|||
endif
|
||||
endif
|
||||
ifeq ($(ARCH), x86_64)
|
||||
ifneq ($(NEWPGI2),1)
|
||||
FCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
FCOMMON_OPT += -tp px
|
||||
endif
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER6)
|
||||
$(warning NVIDIA HPC compilers do not support POWER6.)
|
||||
endif
|
||||
ifeq ($(CORE), POWER8)
|
||||
FCOMMON_OPT += -tp pwr8
|
||||
endif
|
||||
ifeq ($(CORE), POWER9)
|
||||
FCOMMON_OPT += -tp pwr9
|
||||
endif
|
||||
ifeq ($(CORE), POWER10)
|
||||
$(warning NVIDIA HPC compilers do not support POWER10.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -tp p7
|
||||
endif
|
||||
FCOMMON_OPT += -Mrecursive
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -mp
|
||||
endif
|
||||
|
|
@ -1078,11 +1186,11 @@ FCOMMON_OPT += -n32
|
|||
else
|
||||
FCOMMON_OPT += -n64
|
||||
endif
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
ifeq ($(CORE), LOONGSON3R3)
|
||||
FCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
ifeq ($(CORE), LOONGSON3R4)
|
||||
FCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
|
|
@ -1108,11 +1216,11 @@ CCOMMON_OPT += -n32
|
|||
else
|
||||
CCOMMON_OPT += -n64
|
||||
endif
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
ifeq ($(CORE), LOONGSON3R3)
|
||||
CCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
ifeq ($(CORE), LOONGSON3R4)
|
||||
CCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
|
|
@ -1180,6 +1288,8 @@ CCOMMON_OPT += -fPIC
|
|||
endif
|
||||
ifeq ($(F_COMPILER), SUN)
|
||||
FCOMMON_OPT += -pic
|
||||
else ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -PIC
|
||||
else
|
||||
FCOMMON_OPT += -fPIC
|
||||
endif
|
||||
|
|
@ -1223,10 +1333,8 @@ ifdef SMP
|
|||
CCOMMON_OPT += -DSMP_SERVER
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
|
|
@ -1259,6 +1367,10 @@ CCOMMON_OPT += -DUSE_PAPI
|
|||
EXTRALIB += -lpapi -lperfctr
|
||||
endif
|
||||
|
||||
ifdef BUFFERSIZE
|
||||
CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
|
||||
endif
|
||||
|
||||
ifdef DYNAMIC_THREADS
|
||||
CCOMMON_OPT += -DDYNAMIC_THREADS
|
||||
endif
|
||||
|
|
@ -1342,11 +1454,9 @@ endif
|
|||
|
||||
ifneq ($(ARCH), x86_64)
|
||||
ifneq ($(ARCH), x86)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
NO_AFFINITY = 1
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef NO_AFFINITY
|
||||
ifeq ($(NO_AFFINITY), 0)
|
||||
|
|
@ -1438,6 +1548,10 @@ LAPACK_FFLAGS := $(FFLAGS)
|
|||
LAPACK_FPFLAGS := $(FPFLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER),NAG)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
|
||||
LAPACK_CFLAGS = $(CFLAGS)
|
||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||
ifdef INTERFACE64
|
||||
|
|
@ -1566,8 +1680,10 @@ export HAVE_VFP
|
|||
export HAVE_VFPV3
|
||||
export HAVE_VFPV4
|
||||
export HAVE_NEON
|
||||
export HAVE_MSA
|
||||
export MSA_FLAGS
|
||||
ifndef NO_MSA
|
||||
export HAVE_MSA
|
||||
export MSA_FLAGS
|
||||
endif
|
||||
export KERNELDIR
|
||||
export FUNCTION_PROFILE
|
||||
export TARGET_CORE
|
||||
|
|
|
|||
17
Makefile.x86
17
Makefile.x86
|
|
@ -1,10 +1,21 @@
|
|||
# COMPILER_PREFIX = mingw32-
|
||||
|
||||
ifdef HAVE_SSE
|
||||
CCOMMON_OPT += -msse
|
||||
FCOMMON_OPT += -msse
|
||||
ifneq ($(DYNAMIC_ARCH),1)
|
||||
ADD_CPUFLAGS = 1
|
||||
else
|
||||
ifdef TARGET_CORE
|
||||
ADD_CPUFLAGS = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef ADD_CPUFLAGS
|
||||
ifdef HAVE_SSE
|
||||
CCOMMON_OPT += -msse
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
ARFLAGS = -m x86
|
||||
|
|
|
|||
|
|
@ -8,42 +8,57 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifneq ($(DYNAMIC_ARCH),1)
|
||||
ADD_CPUFLAGS = 1
|
||||
else
|
||||
ifdef TARGET_CORE
|
||||
ADD_CPUFLAGS = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef ADD_CPUFLAGS
|
||||
ifdef HAVE_SSE3
|
||||
CCOMMON_OPT += -msse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSSE3
|
||||
CCOMMON_OPT += -mssse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mssse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSE4_1
|
||||
CCOMMON_OPT += -msse4.1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse4.1
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_AVX
|
||||
CCOMMON_OPT += -mavx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifndef NO_AVX2
|
||||
ifdef HAVE_AVX2
|
||||
CCOMMON_OPT += -mavx2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
CCOMMON_OPT += -mfma
|
||||
FCOMMON_OPT += -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
|
@ -56,17 +71,22 @@ endif
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), COOPERLAKE)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # this gcc does not support -march=cooperlake, fall back to skylake-avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
|
@ -80,6 +100,34 @@ endif
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SAPPHIRERAPIDS)
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# sapphire rapids support was added in 11
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
CCOMMON_OPT += -march=sapphirerapids
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=sapphirerapids
|
||||
endif
|
||||
else # this gcc does not support -march=sapphirerapids, fall back to skylake-avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef HAVE_AVX2
|
||||
|
|
@ -112,6 +160,7 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
|
|
|
|||
18
README.md
18
README.md
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
[](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
|
||||
|
||||
Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS)
|
||||
Travis CI: [](https://travis-ci.com/xianyi/OpenBLAS)
|
||||
|
||||
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
|
|
@ -13,17 +13,21 @@ Drone CI: [ library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
|
||||
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
|
||||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful.
|
||||
|
||||
## Binary Packages
|
||||
|
||||
We provide official binary packages for the following platform:
|
||||
|
||||
* Windows x86/x86_64
|
||||
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
|
||||
|
||||
## Installation from Source
|
||||
|
||||
|
|
@ -124,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
|
|
@ -149,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
|
||||
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
|
||||
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
|
||||
- **Cortex-A55**: same as ARMV8 (different cpu specifications)
|
||||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||
|
|
@ -174,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
|||
|
||||
#### RISC-V
|
||||
|
||||
- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
|
||||
- **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1.
|
||||
```sh
|
||||
make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
(also known to work on C906)
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
|
|
@ -208,7 +215,8 @@ Please note that it is not possible to combine support for different architectur
|
|||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||
|
||||
## Usage
|
||||
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ HASWELL
|
|||
SKYLAKEX
|
||||
ATOM
|
||||
COOPERLAKE
|
||||
SAPPHIRERAPIDS
|
||||
|
||||
b)AMD CPU:
|
||||
ATHLON
|
||||
|
|
@ -92,6 +93,9 @@ CORTEXA57
|
|||
CORTEXA72
|
||||
CORTEXA73
|
||||
NEOVERSEN1
|
||||
NEOVERSEV1
|
||||
NEOVERSEN2
|
||||
CORTEXA55
|
||||
EMAG8180
|
||||
FALKOR
|
||||
THUNDERX
|
||||
|
|
@ -109,3 +113,9 @@ Z14
|
|||
RISCV64_GENERIC
|
||||
C910V
|
||||
|
||||
11.LOONGARCH64:
|
||||
LOONGSON3R5
|
||||
|
||||
12. Elbrus E2000:
|
||||
E2K
|
||||
|
||||
|
|
|
|||
36
appveyor.yml
36
appveyor.yml
|
|
@ -29,15 +29,15 @@ environment:
|
|||
global:
|
||||
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
|
||||
matrix:
|
||||
- COMPILER: clang-cl
|
||||
WITH_FORTRAN: yes
|
||||
- COMPILER: clang-cl
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: no
|
||||
- COMPILER: cl
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
WITH_FORTRAN: ignore
|
||||
# - COMPILER: clang-cl
|
||||
# WITH_FORTRAN: ON
|
||||
# - COMPILER: clang-cl
|
||||
# DYNAMIC_ARCH: ON
|
||||
# WITH_FORTRAN: OFF
|
||||
# - COMPILER: cl
|
||||
# - COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
# DYNAMIC_ARCH: OFF
|
||||
# WITH_FORTRAN: ignore
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-6.3.0-32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
|
|
@ -46,13 +46,10 @@ environment:
|
|||
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda update --yes -n base conda
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake
|
||||
|
||||
- if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang
|
||||
|
||||
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
|
||||
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
|
||||
- if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
|
||||
|
|
@ -68,15 +65,14 @@ before_build:
|
|||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
build_script:
|
||||
- cmake --build .
|
||||
|
||||
test_script:
|
||||
- echo Running Test
|
||||
- cd utest
|
||||
- openblas_utest
|
||||
- ctest -j2
|
||||
|
||||
|
|
|
|||
|
|
@ -4,14 +4,22 @@ trigger:
|
|||
branches:
|
||||
include:
|
||||
- develop
|
||||
|
||||
resources:
|
||||
containers:
|
||||
- container: oneapi-hpckit
|
||||
image: intel/oneapi-hpckit:latest
|
||||
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
|
||||
- container: oneapi-basekit
|
||||
image: intel/oneapi-basekit:latest
|
||||
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
|
||||
|
||||
jobs:
|
||||
# manylinux1 is useful to test because the
|
||||
# standard Docker container uses an old version
|
||||
# of gcc / glibc
|
||||
- job: manylinux1_gcc
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
echo "FROM quay.io/pypa/manylinux1_x86_64
|
||||
|
|
@ -27,7 +35,7 @@ jobs:
|
|||
displayName: Run manylinux1 docker build
|
||||
- job: Intel_SDE_skx
|
||||
pool:
|
||||
vmImage: 'ubuntu-16.04'
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
# at the time of writing the available Azure Ubuntu vm image
|
||||
|
|
@ -67,5 +75,189 @@ jobs:
|
|||
cd utest
|
||||
dir
|
||||
openblas_utest.exe
|
||||
|
||||
|
||||
- job: Windows_mingw_gmake
|
||||
pool:
|
||||
vmImage: 'windows-latest'
|
||||
steps:
|
||||
- script: |
|
||||
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL"
|
||||
|
||||
- job: Windows_clang_cmake
|
||||
pool:
|
||||
vmImage: 'windows-latest'
|
||||
steps:
|
||||
- script: |
|
||||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
|
||||
set "LIB=C:\Miniconda\Library\lib;%LIB%"
|
||||
set "CPATH=C:\Miniconda\Library\include;%CPATH%
|
||||
conda config --add channels conda-forge --force
|
||||
conda config --set auto_update_conda false
|
||||
conda install --yes ninja
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
- job: Windows_flang_clang
|
||||
pool:
|
||||
vmImage: 'windows-latest'
|
||||
steps:
|
||||
- script: |
|
||||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
|
||||
set "LIB=C:\Miniconda\Library\lib;%LIB%"
|
||||
set "CPATH=C:\Miniconda\Library\include;%CPATH%"
|
||||
conda config --add channels conda-forge --force
|
||||
conda config --set auto_update_conda false
|
||||
conda install --yes --quiet ninja flang
|
||||
mkdir build
|
||||
cd build
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
- job: OSX_OpenMP
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install
|
||||
ls -lR ../blasinst
|
||||
|
||||
- job: OSX_GCC_Nothreads
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
- job: OSX_OpenMP_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
|
||||
|
||||
- job: OSX_OpenMP_Clang_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 ..
|
||||
make
|
||||
ctest
|
||||
|
||||
- job: OSX_dynarch_cmake
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
|
||||
cmake --build .
|
||||
ctest
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
sudo mkdir -p /opt/intel
|
||||
sudo chown $USER /opt/intel
|
||||
displayName: prepare for cache restore
|
||||
- task: Cache@2
|
||||
inputs:
|
||||
path: /opt/intel/oneapi
|
||||
key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"'
|
||||
cacheHitVar: CACHE_RESTORED
|
||||
- script: |
|
||||
curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5
|
||||
hdiutil attach webimage.dmg
|
||||
sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=.
|
||||
installer_exit_code=$?
|
||||
hdiutil detach /Volumes/"$(basename "$URL" .dmg)" -quiet
|
||||
exit $installer_exit_code
|
||||
displayName: install
|
||||
condition: ne(variables.CACHE_RESTORED, 'true')
|
||||
- script: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
make CC=/usr/local/opt/llvm/bin/clang FC=ifort
|
||||
|
||||
- job: OSX_NDK_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install --cask android-ndk
|
||||
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
|
||||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
|
||||
|
||||
- job: OSX_IOS_ARMV8
|
||||
pool:
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0
|
||||
steps:
|
||||
- script: |
|
||||
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: OSX_IOS_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1
|
||||
steps:
|
||||
- script: |
|
||||
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: ALPINE_MUSL
|
||||
pool:
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
|
||||
&& echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \
|
||||
|| exit 1
|
||||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install
|
||||
alpine ls -l mytestdir/include
|
||||
alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c
|
||||
alpine echo "#include <openblas_config.h>" >>test_install.c
|
||||
alpine echo "int main(){" >> test_install.c
|
||||
alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c
|
||||
alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@
|
|||
#include <time.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
|
|
@ -74,6 +76,9 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
struct timeval start, stop;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info_data_t info;
|
||||
uint64_t start = 0, stop = 0;
|
||||
#else
|
||||
struct timespec start = { 0, 0 }, stop = { 0, 0 };
|
||||
#endif
|
||||
|
|
@ -82,6 +87,9 @@ double getsec()
|
|||
{
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info(&info);
|
||||
return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9;
|
||||
#else
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
|
||||
#endif
|
||||
|
|
@ -90,6 +98,8 @@ double getsec()
|
|||
void begin() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
#endif
|
||||
|
|
@ -98,7 +108,9 @@ void begin() {
|
|||
void end() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &stop);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -125,7 +125,7 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -72,13 +72,17 @@ int main(int argc, char *argv[]){
|
|||
FLOAT *a,*work;
|
||||
FLOAT wkopt[4];
|
||||
blasint *ipiv;
|
||||
blasint m, i, j, info,lwork;
|
||||
blasint m, i, j, l, info,lwork;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
double time1,timeg;
|
||||
|
||||
char *p;
|
||||
char btest = 'I';
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
|
|
@ -86,6 +90,9 @@ int main(int argc, char *argv[]){
|
|||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
||||
|
||||
|
|
@ -124,32 +131,41 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, " SIZE FLops Time Lwork\n");
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
timeg = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
GETRF (&m, &m, a, &m, ipiv, &info);
|
||||
for (l = 0; l < loops; l++) {
|
||||
|
||||
if (btest == 'F') begin();
|
||||
GETRF (&m, &m, a, &m, ipiv, &info);
|
||||
if (btest == 'F') {
|
||||
end();
|
||||
timeg += getsec();
|
||||
}
|
||||
if (info) {
|
||||
fprintf(stderr, "Matrix is not singular .. %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
begin();
|
||||
if (btest == 'I') begin();
|
||||
|
||||
lwork = -1;
|
||||
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
|
||||
|
||||
lwork = (blasint)wkopt[0];
|
||||
GETRI(&m, a, &m, ipiv, work, &lwork, &info);
|
||||
end();
|
||||
if (btest == 'I') end();
|
||||
|
||||
if (info) {
|
||||
fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
if (btest == 'I')
|
||||
timeg += getsec();
|
||||
|
||||
} // loops
|
||||
time1 = timeg/(double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops : %10.2f Sec : %d\n",
|
||||
COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork);
|
||||
|
|
|
|||
|
|
@ -72,17 +72,21 @@ int main(int argc, char *argv[]){
|
|||
FLOAT *a, *b;
|
||||
blasint *ipiv;
|
||||
|
||||
blasint m, i, j, info;
|
||||
blasint m, i, j, l, info;
|
||||
blasint unit = 1;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
FLOAT maxerr;
|
||||
|
||||
double time1, time2;
|
||||
double time1, time2, timeg1,timeg2;
|
||||
|
||||
char *p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
|
|
@ -110,9 +114,9 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, " SIZE Residual Decompose Solve Total\n");
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
timeg1 = timeg2 = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for (l = 0; l < loops; l++) {
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
|
|
@ -138,7 +142,7 @@ int main(int argc, char *argv[]){
|
|||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
timeg1 += getsec();
|
||||
|
||||
begin();
|
||||
|
||||
|
|
@ -151,8 +155,10 @@ int main(int argc, char *argv[]){
|
|||
exit(1);
|
||||
}
|
||||
|
||||
time2 = getsec();
|
||||
|
||||
timeg2 += getsec();
|
||||
} //loops
|
||||
time1=timeg1/(double)loops;
|
||||
time2=timeg2/(double)loops;
|
||||
maxerr = 0.;
|
||||
|
||||
for(i = 0; i < m; i++){
|
||||
|
|
|
|||
|
|
@ -99,14 +99,15 @@ int main(int argc, char *argv[]){
|
|||
char *p;
|
||||
char btest = 'F';
|
||||
|
||||
blasint m, i, j, info, uplos=0;
|
||||
double flops;
|
||||
blasint m, i, j, l, info, uplos=0;
|
||||
double flops = 0.;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
double time1, timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
|
|
@ -119,6 +120,8 @@ int main(int argc, char *argv[]){
|
|||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
|
|
@ -129,19 +132,21 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
timeg=0.;
|
||||
for (l = 0; l < loops; l++) {
|
||||
#ifndef COMPLEX
|
||||
if (uplos & 1) {
|
||||
for (j = 0; j < m; j++) {
|
||||
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < m; j++) {
|
||||
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.;
|
||||
}
|
||||
}
|
||||
|
|
@ -192,8 +197,8 @@ int main(int argc, char *argv[]){
|
|||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
if ( btest == 'F')
|
||||
timeg += getsec();
|
||||
|
||||
if ( btest == 'S' )
|
||||
{
|
||||
|
|
@ -214,9 +219,7 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, "Potrs info = %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
|
||||
|
||||
timeg += getsec();
|
||||
}
|
||||
|
||||
if ( btest == 'I' )
|
||||
|
|
@ -232,11 +235,17 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, "Potri info = %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
timeg += getsec();
|
||||
}
|
||||
|
||||
} // loops
|
||||
|
||||
time1 = timeg/(double)loops;
|
||||
if ( btest == 'F')
|
||||
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
if ( btest == 'S')
|
||||
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
|
||||
if ( btest == 'I')
|
||||
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest);
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -46,14 +46,17 @@ int main(int argc, char *argv[]){
|
|||
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
|
||||
blasint m, i, j;
|
||||
blasint m, i, j, l;
|
||||
blasint inc_x= 1;
|
||||
blasint inc_y= 1;
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
|
|
@ -85,8 +88,9 @@ int main(int argc, char *argv[]){
|
|||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
for (l = 0; l < loops; l++) {
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
|
@ -107,8 +111,10 @@ int main(int argc, char *argv[]){
|
|||
|
||||
end();
|
||||
|
||||
time1 = getsec();
|
||||
timeg += getsec();
|
||||
} // loops
|
||||
|
||||
time1 = timeg/(double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
|
||||
|
|
|
|||
|
|
@ -56,17 +56,20 @@ int main(int argc, char *argv[]){
|
|||
|
||||
char uplo='U';
|
||||
char trans='N';
|
||||
|
||||
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||
|
||||
blasint m, i, j;
|
||||
blasint m, i, j, l;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
|
|
@ -95,9 +98,12 @@ int main(int argc, char *argv[]){
|
|||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
timeg = 0.;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for(l = 0; l < loops; l++) {
|
||||
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
|
|
@ -111,8 +117,10 @@ int main(int argc, char *argv[]){
|
|||
|
||||
end();
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
timeg += getsec();
|
||||
|
||||
} //loops
|
||||
time1 = timeg / (double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
|
||||
|
|
|
|||
66
c_check
66
c_check
|
|
@ -1,11 +1,11 @@
|
|||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;
|
||||
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
|
||||
chop($hostarch);
|
||||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
|
|
@ -82,18 +82,20 @@ $os = Interix if ($data =~ /OS_INTERIX/);
|
|||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
|
|
@ -123,6 +125,11 @@ if ($architecture eq "zarch") {
|
|||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "e2k") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "alpha") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
|
|
@ -143,6 +150,11 @@ if ($architecture eq "riscv64") {
|
|||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "loongarch64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
|
|
@ -199,7 +211,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
|||
} else {
|
||||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
$msa_flags = "-mmsa -mfp64 -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
|
|
@ -215,17 +227,19 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
|||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
|
|
|||
7
cblas.h
7
cblas.h
|
|
@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx,
|
|||
|
||||
void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
|
||||
void cblas_srotg(float *a, float *b, float *c, float *s);
|
||||
void cblas_drotg(double *a, double *b, double *c, double *s);
|
||||
void cblas_crotg(void *a, void *b, float *c, void *s);
|
||||
void cblas_zrotg(void *a, void *b, double *c, void *s);
|
||||
|
||||
|
||||
void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
|
||||
void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);
|
||||
|
|
@ -395,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
|
|||
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
|
||||
void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
|
|
|||
|
|
@ -44,7 +44,10 @@ endif ()
|
|||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110)
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
|
|
@ -106,7 +109,11 @@ if (${ARCH} STREQUAL "ia64")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (MIPS64)
|
||||
if (MIPS32 OR MIPS64)
|
||||
set(NO_BINARY_MODE 1)
|
||||
endif ()
|
||||
|
||||
if (LOONGARCH64)
|
||||
set(NO_BINARY_MODE 1)
|
||||
endif ()
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
|
|||
|
||||
if (NO_BINARY_MODE)
|
||||
|
||||
if (MIPS32)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32")
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
||||
if (MIPS64)
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64")
|
||||
|
|
@ -29,6 +34,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
|
|||
set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64")
|
||||
endif ()
|
||||
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
|
||||
else ()
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
||||
if (CMAKE_SYSTEM_NAME STREQUAL "AIX")
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
|
@ -117,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL SAPPHIRERAPIDS)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL A64FX)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL ARMV8SVE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER10)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.2 OR ${GCC_VERSION} VERSION_EQUAL 10.2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." )
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER9)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER8)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (HAVE_AVX2)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
|
||||
|
|
@ -124,9 +197,9 @@ if (NOT DYNAMIC_ARCH)
|
|||
if (HAVE_AVX)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
|
||||
endif ()
|
||||
if (HAVE_FMA3)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
endif ()
|
||||
# if (HAVE_FMA3)
|
||||
#set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
#endif ()
|
||||
if (HAVE_SSE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
|
||||
endif ()
|
||||
|
|
|
|||
|
|
@ -20,19 +20,16 @@
|
|||
# NEEDBUNDERSCORE
|
||||
# NEED2UNDERSCORES
|
||||
|
||||
if (NOT NO_LAPACK)
|
||||
include(CheckLanguage)
|
||||
check_language(Fortran)
|
||||
if(CMAKE_Fortran_COMPILER)
|
||||
enable_language(Fortran)
|
||||
else()
|
||||
message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK")
|
||||
include(CheckLanguage)
|
||||
check_language(Fortran)
|
||||
if(CMAKE_Fortran_COMPILER)
|
||||
enable_language(Fortran)
|
||||
else()
|
||||
if (NOT NO_LAPACK)
|
||||
message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK")
|
||||
endif()
|
||||
set (NOFORTRAN 1)
|
||||
set (NO_LAPACK 1)
|
||||
endif()
|
||||
else()
|
||||
include(CMakeForceCompiler)
|
||||
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)
|
||||
endif()
|
||||
|
||||
if (NOT ONLY_CBLAS)
|
||||
|
|
|
|||
|
|
@ -3,11 +3,6 @@
|
|||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (INTERFACE64)
|
||||
set(SUFFIX64 64)
|
||||
set(SUFFIX64_UNDERSCORE _64)
|
||||
endif()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
|
|
@ -61,6 +56,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
|
|||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
|
||||
endif ()
|
||||
endif ()
|
||||
if (LOONGARCH64)
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
else ()
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
|
|
@ -97,7 +99,7 @@ endif ()
|
|||
|
||||
if (${F_COMPILER} STREQUAL "IBM")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM")
|
||||
# FCOMMON_OPT += -qarch=440
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur")
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -q64")
|
||||
if (INTERFACE64)
|
||||
|
|
|
|||
|
|
@ -1,212 +1,218 @@
|
|||
# helper functions for the kernel CMakeLists.txt
|
||||
|
||||
function(SetFallback KERNEL SOURCE_PATH)
|
||||
if (NOT (DEFINED ${KERNEL}))
|
||||
set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE)
|
||||
endif ()
|
||||
endfunction()
|
||||
|
||||
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
|
||||
macro(SetDefaultL1)
|
||||
set(SAMAXKERNEL amax.S)
|
||||
set(DAMAXKERNEL amax.S)
|
||||
set(QAMAXKERNEL amax.S)
|
||||
set(CAMAXKERNEL zamax.S)
|
||||
set(ZAMAXKERNEL zamax.S)
|
||||
set(XAMAXKERNEL zamax.S)
|
||||
set(SAMINKERNEL amin.S)
|
||||
set(DAMINKERNEL amin.S)
|
||||
set(QAMINKERNEL amin.S)
|
||||
set(CAMINKERNEL zamin.S)
|
||||
set(ZAMINKERNEL zamin.S)
|
||||
set(XAMINKERNEL zamin.S)
|
||||
set(SMAXKERNEL max.S)
|
||||
set(DMAXKERNEL max.S)
|
||||
set(QMAXKERNEL max.S)
|
||||
set(SMINKERNEL min.S)
|
||||
set(DMINKERNEL min.S)
|
||||
set(QMINKERNEL min.S)
|
||||
set(ISAMAXKERNEL iamax.S)
|
||||
set(IDAMAXKERNEL iamax.S)
|
||||
set(IQAMAXKERNEL iamax.S)
|
||||
set(ICAMAXKERNEL izamax.S)
|
||||
set(IZAMAXKERNEL izamax.S)
|
||||
set(IXAMAXKERNEL izamax.S)
|
||||
set(ISAMINKERNEL iamin.S)
|
||||
set(IDAMINKERNEL iamin.S)
|
||||
set(IQAMINKERNEL iamin.S)
|
||||
set(ICAMINKERNEL izamin.S)
|
||||
set(IZAMINKERNEL izamin.S)
|
||||
set(IXAMINKERNEL izamin.S)
|
||||
set(ISMAXKERNEL iamax.S)
|
||||
set(IDMAXKERNEL iamax.S)
|
||||
set(IQMAXKERNEL iamax.S)
|
||||
set(ISMINKERNEL iamin.S)
|
||||
set(IDMINKERNEL iamin.S)
|
||||
set(IQMINKERNEL iamin.S)
|
||||
set(SASUMKERNEL asum.S)
|
||||
set(DASUMKERNEL asum.S)
|
||||
set(CASUMKERNEL zasum.S)
|
||||
set(ZASUMKERNEL zasum.S)
|
||||
set(QASUMKERNEL asum.S)
|
||||
set(XASUMKERNEL zasum.S)
|
||||
set(SAXPYKERNEL axpy.S)
|
||||
set(DAXPYKERNEL axpy.S)
|
||||
set(CAXPYKERNEL zaxpy.S)
|
||||
set(ZAXPYKERNEL zaxpy.S)
|
||||
set(QAXPYKERNEL axpy.S)
|
||||
set(XAXPYKERNEL zaxpy.S)
|
||||
set(SCOPYKERNEL copy.S)
|
||||
set(DCOPYKERNEL copy.S)
|
||||
set(CCOPYKERNEL zcopy.S)
|
||||
set(ZCOPYKERNEL zcopy.S)
|
||||
set(QCOPYKERNEL copy.S)
|
||||
set(XCOPYKERNEL zcopy.S)
|
||||
set(SDOTKERNEL dot.S)
|
||||
set(DDOTKERNEL dot.S)
|
||||
set(CDOTKERNEL zdot.S)
|
||||
set(ZDOTKERNEL zdot.S)
|
||||
set(QDOTKERNEL dot.S)
|
||||
set(XDOTKERNEL zdot.S)
|
||||
set(SNRM2KERNEL nrm2.S)
|
||||
set(DNRM2KERNEL nrm2.S)
|
||||
set(QNRM2KERNEL nrm2.S)
|
||||
set(CNRM2KERNEL znrm2.S)
|
||||
set(ZNRM2KERNEL znrm2.S)
|
||||
set(XNRM2KERNEL znrm2.S)
|
||||
set(SROTKERNEL rot.S)
|
||||
set(DROTKERNEL rot.S)
|
||||
set(QROTKERNEL rot.S)
|
||||
set(CROTKERNEL zrot.S)
|
||||
set(ZROTKERNEL zrot.S)
|
||||
set(XROTKERNEL zrot.S)
|
||||
set(SSCALKERNEL scal.S)
|
||||
set(DSCALKERNEL scal.S)
|
||||
set(CSCALKERNEL zscal.S)
|
||||
set(ZSCALKERNEL zscal.S)
|
||||
set(QSCALKERNEL scal.S)
|
||||
set(XSCALKERNEL zscal.S)
|
||||
set(SSWAPKERNEL swap.S)
|
||||
set(DSWAPKERNEL swap.S)
|
||||
set(CSWAPKERNEL zswap.S)
|
||||
set(ZSWAPKERNEL zswap.S)
|
||||
set(QSWAPKERNEL swap.S)
|
||||
set(XSWAPKERNEL zswap.S)
|
||||
set(SGEMVNKERNEL gemv_n.S)
|
||||
set(SGEMVTKERNEL gemv_t.S)
|
||||
set(DGEMVNKERNEL gemv_n.S)
|
||||
set(DGEMVTKERNEL gemv_t.S)
|
||||
set(CGEMVNKERNEL zgemv_n.S)
|
||||
set(CGEMVTKERNEL zgemv_t.S)
|
||||
set(ZGEMVNKERNEL zgemv_n.S)
|
||||
set(ZGEMVTKERNEL zgemv_t.S)
|
||||
set(QGEMVNKERNEL gemv_n.S)
|
||||
set(QGEMVTKERNEL gemv_t.S)
|
||||
set(XGEMVNKERNEL zgemv_n.S)
|
||||
set(XGEMVTKERNEL zgemv_t.S)
|
||||
set(SCABS_KERNEL ../generic/cabs.c)
|
||||
set(DCABS_KERNEL ../generic/cabs.c)
|
||||
set(QCABS_KERNEL ../generic/cabs.c)
|
||||
set(LSAME_KERNEL ../generic/lsame.c)
|
||||
set(SAXPBYKERNEL ../arm/axpby.c)
|
||||
set(DAXPBYKERNEL ../arm/axpby.c)
|
||||
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||
set(SSUMKERNEL sum.S)
|
||||
set(DSUMKERNEL sum.S)
|
||||
set(CSUMKERNEL zsum.S)
|
||||
set(ZSUMKERNEL zsum.S)
|
||||
set(QSUMKERNEL sum.S)
|
||||
set(XSUMKERNEL zsum.S)
|
||||
SetFallback(SAMAXKERNEL amax.S)
|
||||
SetFallback(DAMAXKERNEL amax.S)
|
||||
SetFallback(QAMAXKERNEL amax.S)
|
||||
SetFallback(CAMAXKERNEL zamax.S)
|
||||
SetFallback(ZAMAXKERNEL zamax.S)
|
||||
SetFallback(XAMAXKERNEL zamax.S)
|
||||
SetFallback(SAMINKERNEL amin.S)
|
||||
SetFallback(DAMINKERNEL amin.S)
|
||||
SetFallback(QAMINKERNEL amin.S)
|
||||
SetFallback(CAMINKERNEL zamin.S)
|
||||
SetFallback(ZAMINKERNEL zamin.S)
|
||||
SetFallback(XAMINKERNEL zamin.S)
|
||||
SetFallback(SMAXKERNEL max.S)
|
||||
SetFallback(DMAXKERNEL max.S)
|
||||
SetFallback(QMAXKERNEL max.S)
|
||||
SetFallback(SMINKERNEL min.S)
|
||||
SetFallback(DMINKERNEL min.S)
|
||||
SetFallback(QMINKERNEL min.S)
|
||||
SetFallback(ISAMAXKERNEL iamax.S)
|
||||
SetFallback(IDAMAXKERNEL iamax.S)
|
||||
SetFallback(IQAMAXKERNEL iamax.S)
|
||||
SetFallback(ICAMAXKERNEL izamax.S)
|
||||
SetFallback(IZAMAXKERNEL izamax.S)
|
||||
SetFallback(IXAMAXKERNEL izamax.S)
|
||||
SetFallback(ISAMINKERNEL iamin.S)
|
||||
SetFallback(IDAMINKERNEL iamin.S)
|
||||
SetFallback(IQAMINKERNEL iamin.S)
|
||||
SetFallback(ICAMINKERNEL izamin.S)
|
||||
SetFallback(IZAMINKERNEL izamin.S)
|
||||
SetFallback(IXAMINKERNEL izamin.S)
|
||||
SetFallback(ISMAXKERNEL iamax.S)
|
||||
SetFallback(IDMAXKERNEL iamax.S)
|
||||
SetFallback(IQMAXKERNEL iamax.S)
|
||||
SetFallback(ISMINKERNEL iamin.S)
|
||||
SetFallback(IDMINKERNEL iamin.S)
|
||||
SetFallback(IQMINKERNEL iamin.S)
|
||||
SetFallback(SASUMKERNEL asum.S)
|
||||
SetFallback(DASUMKERNEL asum.S)
|
||||
SetFallback(CASUMKERNEL zasum.S)
|
||||
SetFallback(ZASUMKERNEL zasum.S)
|
||||
SetFallback(QASUMKERNEL asum.S)
|
||||
SetFallback(XASUMKERNEL zasum.S)
|
||||
SetFallback(SAXPYKERNEL axpy.S)
|
||||
SetFallback(DAXPYKERNEL axpy.S)
|
||||
SetFallback(CAXPYKERNEL zaxpy.S)
|
||||
SetFallback(ZAXPYKERNEL zaxpy.S)
|
||||
SetFallback(QAXPYKERNEL axpy.S)
|
||||
SetFallback(XAXPYKERNEL zaxpy.S)
|
||||
SetFallback(SCOPYKERNEL copy.S)
|
||||
SetFallback(DCOPYKERNEL copy.S)
|
||||
SetFallback(CCOPYKERNEL zcopy.S)
|
||||
SetFallback(ZCOPYKERNEL zcopy.S)
|
||||
SetFallback(QCOPYKERNEL copy.S)
|
||||
SetFallback(XCOPYKERNEL zcopy.S)
|
||||
SetFallback(SDOTKERNEL dot.S)
|
||||
SetFallback(DDOTKERNEL dot.S)
|
||||
SetFallback(CDOTKERNEL zdot.S)
|
||||
SetFallback(ZDOTKERNEL zdot.S)
|
||||
SetFallback(QDOTKERNEL dot.S)
|
||||
SetFallback(XDOTKERNEL zdot.S)
|
||||
SetFallback(SNRM2KERNEL nrm2.S)
|
||||
SetFallback(DNRM2KERNEL nrm2.S)
|
||||
SetFallback(QNRM2KERNEL nrm2.S)
|
||||
SetFallback(CNRM2KERNEL znrm2.S)
|
||||
SetFallback(ZNRM2KERNEL znrm2.S)
|
||||
SetFallback(XNRM2KERNEL znrm2.S)
|
||||
SetFallback(SROTKERNEL rot.S)
|
||||
SetFallback(DROTKERNEL rot.S)
|
||||
SetFallback(QROTKERNEL rot.S)
|
||||
SetFallback(CROTKERNEL zrot.S)
|
||||
SetFallback(ZROTKERNEL zrot.S)
|
||||
SetFallback(XROTKERNEL zrot.S)
|
||||
SetFallback(SSCALKERNEL scal.S)
|
||||
SetFallback(DSCALKERNEL scal.S)
|
||||
SetFallback(CSCALKERNEL zscal.S)
|
||||
SetFallback(ZSCALKERNEL zscal.S)
|
||||
SetFallback(QSCALKERNEL scal.S)
|
||||
SetFallback(XSCALKERNEL zscal.S)
|
||||
SetFallback(SSWAPKERNEL swap.S)
|
||||
SetFallback(DSWAPKERNEL swap.S)
|
||||
SetFallback(CSWAPKERNEL zswap.S)
|
||||
SetFallback(ZSWAPKERNEL zswap.S)
|
||||
SetFallback(QSWAPKERNEL swap.S)
|
||||
SetFallback(XSWAPKERNEL zswap.S)
|
||||
SetFallback(SGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(SGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(DGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(DGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(CGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(CGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(ZGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(ZGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(QGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(QGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(XGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(XGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(SCABS_KERNEL ../generic/cabs.c)
|
||||
SetFallback(DCABS_KERNEL ../generic/cabs.c)
|
||||
SetFallback(QCABS_KERNEL ../generic/cabs.c)
|
||||
SetFallback(LSAME_KERNEL ../generic/lsame.c)
|
||||
SetFallback(SAXPBYKERNEL ../arm/axpby.c)
|
||||
SetFallback(DAXPBYKERNEL ../arm/axpby.c)
|
||||
SetFallback(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||
SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||
SetFallback(SSUMKERNEL sum.S)
|
||||
SetFallback(DSUMKERNEL sum.S)
|
||||
SetFallback(CSUMKERNEL zsum.S)
|
||||
SetFallback(ZSUMKERNEL zsum.S)
|
||||
SetFallback(QSUMKERNEL sum.S)
|
||||
SetFallback(XSUMKERNEL zsum.S)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SHAMINKERNEL ../arm/amin.c)
|
||||
set(SHAMAXKERNEL ../arm/amax.c)
|
||||
set(SHMAXKERNEL ../arm/max.c)
|
||||
set(SHMINKERNEL ../arm/min.c)
|
||||
set(ISHAMAXKERNEL ../arm/iamax.c)
|
||||
set(ISHAMINKERNEL ../arm/iamin.c)
|
||||
set(ISHMAXKERNEL ../arm/imax.c)
|
||||
set(ISHMINKERNEL ../arm/imin.c)
|
||||
set(SHASUMKERNEL ../arm/asum.c)
|
||||
set(SHAXPYKERNEL ../arm/axpy.c)
|
||||
set(SHAXPBYKERNEL ../arm/axpby.c)
|
||||
set(SHCOPYKERNEL ../arm/copy.c)
|
||||
set(SBDOTKERNEL ../x86_64/sbdot.c)
|
||||
set(SHROTKERNEL ../arm/rot.c)
|
||||
set(SHSCALKERNEL ../arm/scal.c)
|
||||
set(SHNRM2KERNEL ../arm/nrm2.c)
|
||||
set(SHSUMKERNEL ../arm/sum.c)
|
||||
set(SHSWAPKERNEL ../arm/swap.c)
|
||||
set(TOBF16KERNEL ../x86_64/tobf16.c)
|
||||
set(BF16TOKERNEL ../x86_64/bf16to.c)
|
||||
SetFallback(SHAMINKERNEL ../arm/amin.c)
|
||||
SetFallback(SHAMAXKERNEL ../arm/amax.c)
|
||||
SetFallback(SHMAXKERNEL ../arm/max.c)
|
||||
SetFallback(SHMINKERNEL ../arm/min.c)
|
||||
SetFallback(ISHAMAXKERNEL ../arm/iamax.c)
|
||||
SetFallback(ISHAMINKERNEL ../arm/iamin.c)
|
||||
SetFallback(ISHMAXKERNEL ../arm/imax.c)
|
||||
SetFallback(ISHMINKERNEL ../arm/imin.c)
|
||||
SetFallback(SHASUMKERNEL ../arm/asum.c)
|
||||
SetFallback(SHAXPYKERNEL ../arm/axpy.c)
|
||||
SetFallback(SHAXPBYKERNEL ../arm/axpby.c)
|
||||
SetFallback(SHCOPYKERNEL ../arm/copy.c)
|
||||
SetFallback(SBDOTKERNEL ../x86_64/sbdot.c)
|
||||
SetFallback(SHROTKERNEL ../arm/rot.c)
|
||||
SetFallback(SHSCALKERNEL ../arm/scal.c)
|
||||
SetFallback(SHNRM2KERNEL ../arm/nrm2.c)
|
||||
SetFallback(SHSUMKERNEL ../arm/sum.c)
|
||||
SetFallback(SHSWAPKERNEL ../arm/swap.c)
|
||||
SetFallback(TOBF16KERNEL ../x86_64/tobf16.c)
|
||||
SetFallback(BF16TOKERNEL ../x86_64/bf16to.c)
|
||||
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL2)
|
||||
set(SGEMVNKERNEL ../arm/gemv_n.c)
|
||||
set(SGEMVTKERNEL ../arm/gemv_t.c)
|
||||
set(DGEMVNKERNEL gemv_n.S)
|
||||
set(DGEMVTKERNEL gemv_t.S)
|
||||
set(CGEMVNKERNEL zgemv_n.S)
|
||||
set(CGEMVTKERNEL zgemv_t.S)
|
||||
set(ZGEMVNKERNEL zgemv_n.S)
|
||||
set(ZGEMVTKERNEL zgemv_t.S)
|
||||
set(QGEMVNKERNEL gemv_n.S)
|
||||
set(QGEMVTKERNEL gemv_t.S)
|
||||
set(XGEMVNKERNEL zgemv_n.S)
|
||||
set(XGEMVTKERNEL zgemv_t.S)
|
||||
set(SGERKERNEL ../generic/ger.c)
|
||||
set(DGERKERNEL ../generic/ger.c)
|
||||
set(QGERKERNEL ../generic/ger.c)
|
||||
set(CGERUKERNEL ../generic/zger.c)
|
||||
set(CGERCKERNEL ../generic/zger.c)
|
||||
set(ZGERUKERNEL ../generic/zger.c)
|
||||
set(ZGERCKERNEL ../generic/zger.c)
|
||||
set(XGERUKERNEL ../generic/zger.c)
|
||||
set(XGERCKERNEL ../generic/zger.c)
|
||||
set(SSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
set(SSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
set(DSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
set(DSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
set(QSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
set(QSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
set(CSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
set(CSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
set(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
set(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
set(XSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
set(XSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
set(CHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
set(CHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(CHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(CHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(SGEMVNKERNEL ../arm/gemv_n.c)
|
||||
SetFallback(SGEMVTKERNEL ../arm/gemv_t.c)
|
||||
SetFallback(DGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(DGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(CGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(CGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(ZGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(ZGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(QGEMVNKERNEL gemv_n.S)
|
||||
SetFallback(QGEMVTKERNEL gemv_t.S)
|
||||
SetFallback(XGEMVNKERNEL zgemv_n.S)
|
||||
SetFallback(XGEMVTKERNEL zgemv_t.S)
|
||||
SetFallback(SGERKERNEL ../generic/ger.c)
|
||||
SetFallback(DGERKERNEL ../generic/ger.c)
|
||||
SetFallback(QGERKERNEL ../generic/ger.c)
|
||||
SetFallback(CGERUKERNEL ../generic/zger.c)
|
||||
SetFallback(CGERCKERNEL ../generic/zger.c)
|
||||
SetFallback(ZGERUKERNEL ../generic/zger.c)
|
||||
SetFallback(ZGERCKERNEL ../generic/zger.c)
|
||||
SetFallback(XGERUKERNEL ../generic/zger.c)
|
||||
SetFallback(XGERCKERNEL ../generic/zger.c)
|
||||
SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c)
|
||||
SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c)
|
||||
SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
set(SHGERKERNEL ../generic/ger.c)
|
||||
SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
|
||||
SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
|
||||
SetFallback(SHGERKERNEL ../generic/ger.c)
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
||||
macro(SetDefaultL3)
|
||||
set(SGEADD_KERNEL ../generic/geadd.c)
|
||||
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
SetFallback(SGEADD_KERNEL ../generic/geadd.c)
|
||||
SetFallback(DGEADD_KERNEL ../generic/geadd.c)
|
||||
SetFallback(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SHGEADD_KERNEL ../generic/geadd.c)
|
||||
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||
set(SBGEMM_BETA ../generic/gemm_beta.c)
|
||||
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SBGEMMINCOPYOBJ sbgemm_incopy.o)
|
||||
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
|
||||
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
|
||||
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
|
||||
SetFallback(SHGEADD_KERNEL ../generic/geadd.c)
|
||||
SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||
SetFallback(SBGEMM_BETA ../generic/gemm_beta.c)
|
||||
SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
|
||||
SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
|
||||
SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
|
||||
SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
|
||||
SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o)
|
||||
SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
|
||||
SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
|
||||
SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
|
||||
endif ()
|
||||
|
||||
endmacro ()
|
||||
|
|
|
|||
|
|
@ -66,7 +66,7 @@ set(SLASRC
|
|||
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
|
||||
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
|
||||
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
|
||||
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
|
||||
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
|
||||
slarrv.f slartv.f
|
||||
slarz.f slarzb.f slarzt.f slasy2.f
|
||||
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
|
||||
|
|
@ -112,14 +112,14 @@ set(SLASRC
|
|||
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
|
||||
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f
|
||||
sgelqt.f sgelqt3.f sgemlqt.f
|
||||
sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
|
||||
sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
|
||||
sgelq.f slaswlq.f slamswlq.f sgemlq.f
|
||||
stplqt.f stplqt2.f stpmlqt.f
|
||||
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
||||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
|
|
@ -171,7 +171,7 @@ set(CLASRC
|
|||
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
|
||||
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
|
||||
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
|
||||
clarf.f clarfb.f clarfg.f clarfgp.f clarft.f
|
||||
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
|
||||
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
|
||||
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
|
||||
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
|
||||
|
|
@ -209,14 +209,14 @@ set(CLASRC
|
|||
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
|
||||
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f
|
||||
cgelqt.f cgelqt3.f cgemlqt.f
|
||||
cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
|
||||
cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
|
||||
cgelq.f claswlq.f clamswlq.f cgemlq.f
|
||||
ctplqt.f ctplqt2.f ctpmlqt.f
|
||||
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
||||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cunhr_col.f )
|
||||
cungtsqr.f cungtsqr_row.f cunhr_col.f )
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
|
|
@ -253,7 +253,7 @@ set(DLASRC
|
|||
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
|
||||
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
|
||||
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
|
||||
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
|
||||
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
|
||||
dlargv.f dlarrv.f dlartv.f
|
||||
dlarz.f dlarzb.f dlarzt.f dlasy2.f
|
||||
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
|
||||
|
|
@ -300,14 +300,14 @@ set(DLASRC
|
|||
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
|
||||
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f
|
||||
dgelqt.f dgelqt3.f dgemlqt.f
|
||||
dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
|
||||
dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
|
||||
dgelq.f dlaswlq.f dlamswlq.f dgemlq.f
|
||||
dtplqt.f dtplqt2.f dtpmlqt.f
|
||||
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
||||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
|
|
@ -360,7 +360,7 @@ set(ZLASRC
|
|||
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
|
||||
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
|
||||
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
|
||||
zlarcm.f zlarf.f zlarfb.f
|
||||
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
|
||||
zlarfg.f zlarfgp.f zlarft.f
|
||||
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
|
|
@ -402,13 +402,13 @@ set(ZLASRC
|
|||
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f
|
||||
ztplqt.f ztplqt2.f ztpmlqt.f
|
||||
zgelqt.f zgelqt3.f zgemlqt.f
|
||||
zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
|
||||
zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
|
||||
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
||||
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
||||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zunhr_col.f)
|
||||
zungtsqr.f zungtsqr_row.f zunhr_col.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
|
|
|||
|
|
@ -114,6 +114,8 @@ set(CSRC
|
|||
lapacke_cgetrs_work.c
|
||||
lapacke_cgetsls.c
|
||||
lapacke_cgetsls_work.c
|
||||
lapacke_cgetsqrhrt.c
|
||||
lapacke_cgetsqrhrt_work.c
|
||||
lapacke_cggbak.c
|
||||
lapacke_cggbak_work.c
|
||||
lapacke_cggbal.c
|
||||
|
|
@ -590,6 +592,8 @@ set(CSRC
|
|||
lapacke_cungrq_work.c
|
||||
lapacke_cungtr.c
|
||||
lapacke_cungtr_work.c
|
||||
lapacke_cungtsqr_row.c
|
||||
lapacke_cungtsqr_row_work.c
|
||||
lapacke_cunmbr.c
|
||||
lapacke_cunmbr_work.c
|
||||
lapacke_cunmhr.c
|
||||
|
|
@ -735,6 +739,8 @@ set(DSRC
|
|||
lapacke_dgetrs_work.c
|
||||
lapacke_dgetsls.c
|
||||
lapacke_dgetsls_work.c
|
||||
lapacke_dgetsqrhrt.c
|
||||
lapacke_dgetsqrhrt_work.c
|
||||
lapacke_dggbak.c
|
||||
lapacke_dggbak_work.c
|
||||
lapacke_dggbal.c
|
||||
|
|
@ -862,6 +868,8 @@ set(DSRC
|
|||
lapacke_dorgrq_work.c
|
||||
lapacke_dorgtr.c
|
||||
lapacke_dorgtr_work.c
|
||||
lapacke_dorgtsqr_row.c
|
||||
lapacke_dorgtsqr_row_work.c
|
||||
lapacke_dormbr.c
|
||||
lapacke_dormbr_work.c
|
||||
lapacke_dormhr.c
|
||||
|
|
@ -1309,6 +1317,8 @@ set(SSRC
|
|||
lapacke_sgetrs_work.c
|
||||
lapacke_sgetsls.c
|
||||
lapacke_sgetsls_work.c
|
||||
lapacke_sgetsqrhrt.c
|
||||
lapacke_sgetsqrhrt_work.c
|
||||
lapacke_sggbak.c
|
||||
lapacke_sggbak_work.c
|
||||
lapacke_sggbal.c
|
||||
|
|
@ -1435,6 +1445,8 @@ set(SSRC
|
|||
lapacke_sorgrq_work.c
|
||||
lapacke_sorgtr.c
|
||||
lapacke_sorgtr_work.c
|
||||
lapacke_sorgtsqr_row.c
|
||||
lapacke_sorgtsqr_row_work.c
|
||||
lapacke_sormbr.c
|
||||
lapacke_sormbr_work.c
|
||||
lapacke_sormhr.c
|
||||
|
|
@ -1877,6 +1889,8 @@ set(ZSRC
|
|||
lapacke_zgetrs_work.c
|
||||
lapacke_zgetsls.c
|
||||
lapacke_zgetsls_work.c
|
||||
lapacke_zgetsqrhrt.c
|
||||
lapacke_zgetsqrhrt_work.c
|
||||
lapacke_zggbak.c
|
||||
lapacke_zggbak_work.c
|
||||
lapacke_zggbal.c
|
||||
|
|
@ -2351,6 +2365,8 @@ set(ZSRC
|
|||
lapacke_zungrq_work.c
|
||||
lapacke_zungtr.c
|
||||
lapacke_zungtr_work.c
|
||||
lapacke_zungtsqr_row.c
|
||||
lapacke_zungtsqr_row_work.c
|
||||
lapacke_zunmbr.c
|
||||
lapacke_zunmbr_work.c
|
||||
lapacke_zunmhr.c
|
||||
|
|
@ -2499,6 +2515,5 @@ foreach (Utils_FILE ${Utils_SRC})
|
|||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY)
|
||||
include_directories(${lapacke_include_dir})
|
||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
|
|
|
|||
|
|
@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
|
|
@ -177,7 +181,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
|
@ -243,11 +247,11 @@ endif ()
|
|||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t1048576\n\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t16\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
|
|
@ -263,6 +267,62 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "NEOVERSEV1")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t1048576\n\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define HAVE_SVE\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "NEOVERSEN2")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||
"#define L2_SIZE\t1048576\n\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t48\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_VFPV4\n"
|
||||
"#define HAVE_VFPV3\n"
|
||||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define HAVE_SVE\n"
|
||||
"#define ARMV8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "FALKOR")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
|
|
@ -416,7 +476,7 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "VORTEX")
|
||||
elseif ("${TCORE}" STREQUAL "VORTEX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
|
|
@ -439,6 +499,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX")
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "P5600")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L2_SIZE 1048576\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" MATCHES "MIPS")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L2_SIZE 262144\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define DTB_DEFAULT_ENTRIES 64\n")
|
||||
set(SGEMM_UNROLL_M 2)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
|
|
|
|||
|
|
@ -33,15 +33,18 @@ endif ()
|
|||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
set(NO_AVX 1)
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE")
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS")
|
||||
set(TARGET "NEHALEM")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
|
||||
set(TARGET "ARMV7")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10")
|
||||
set(TARGET "POWER6")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
|
|
@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc)
|
|||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -static")
|
||||
endif ()
|
||||
|
||||
# POWER-specific setup: sets NO_WARMUP and records an assembler flag in GETARCH_FLAGS.
if (POWER)
|
||||
set(NO_WARMUP 1)
|
||||
# HAVE_GAS starts at 1 and is reset to 0 for GNU and Clang assemblers below.
# NOTE(review): 0 for a GNU assembler reads as inverted; presumably it mirrors an
# exit-status convention used by the Makefile build - confirm before changing.
set(HAVE_GAS 1)
|
||||
if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU")
|
||||
set(HAVE_GAS 0)
|
||||
elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang")
|
||||
# Clang: add -fno-integrated-as to the common flags, then use the same value as the GNU case.
set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as")
|
||||
set(HAVE_GAS 0)
|
||||
endif ()
|
||||
# Append the result to GETARCH_FLAGS as a preprocessor define.
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}")
|
||||
endif ()
|
||||
|
||||
# If Fortran is not used, only CBLAS is compiled.
|
||||
if (ONLY_CBLAS)
|
||||
set(NO_LAPACK 1)
|
||||
|
|
@ -148,16 +163,36 @@ endif ()
|
|||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
# endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
# SAPPHIRERAPIDS with AVX512 allowed: pick -march=sapphirerapids when the compiler version
# is above the threshold below, otherwise fall back to -march=skylake-avx512.
# Compilers other than GNU, Clang and AppleClang get no -march flag from this block.
if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
# GCC_VERSION is captured here but not read in this block; the test below uses
# CMAKE_C_COMPILER_VERSION instead.
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
# NOTE(review): the same 12.0 threshold is applied to Clang and AppleClang; AppleClang
# version numbers may not line up with upstream Clang releases - confirm.
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
|
|
@ -182,11 +217,11 @@ if (DEFINED TARGET)
|
|||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if (NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
endif()
|
||||
endif()
|
||||
# if (DEFINED HAVE_FMA3)
|
||||
# if (NOT NO_AVX2)
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
# endif()
|
||||
# endif()
|
||||
if (DEFINED HAVE_SSE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
|
||||
endif()
|
||||
|
|
@ -202,6 +237,27 @@ if (DEFINED TARGET)
|
|||
if (DEFINED HAVE_SSE4_1)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
|
||||
endif()
|
||||
|
||||
# POWER10 kernels are built with -mcpu=power10, which this check accepts from version 10.2 on.
# The comparison uses CMAKE_C_COMPILER_VERSION because `-dumpversion` may print only the
# major number (e.g. "10"); that compares below 10.2 and aborted configuration on 10.2+ compilers.
if (${TARGET} STREQUAL POWER10)
|
||||
# Kept so GCC_VERSION stays populated for any code outside this block that reads it.
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
# Quoted so the test stays well-formed even if the compiler version could not be detected.
if (NOT "${CMAKE_C_COMPILER_VERSION}" VERSION_LESS 10.2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
message(FATAL_ERROR "Compiler GCC.${CMAKE_C_COMPILER_VERSION} does not support Power10.")
|
||||
endif()
|
||||
endif()
|
||||
# POWER9: use power9 code generation flags when the reported compiler version is at least 5.0;
# otherwise build with power8 flags and emit a warning instead of failing.
if (${TARGET} STREQUAL POWER9)
|
||||
# Version string as printed by `<compiler> -dumpversion`; the output is not whitespace-stripped.
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 5.0 OR ${GCC_VERSION} VERSION_EQUAL 5.0)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math")
|
||||
else ()
|
||||
# Older compiler: fall back to power8 flags.
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.")
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL POWER8)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
|
|
@ -219,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
|
|||
# C Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||
|
||||
# When INTERFACE64 is set, define the two suffix variables SUFFIX64 ("64") and
# SUFFIX64_UNDERSCORE ("_64"); both stay unset otherwise.
if (INTERFACE64)
|
||||
set(SUFFIX64 64)
|
||||
set(SUFFIX64_UNDERSCORE _64)
|
||||
endif()
|
||||
|
||||
if (NOT NOFORTRAN)
|
||||
# Fortran Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||
|
|
@ -233,6 +294,11 @@ if (BINARY64)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
# EMBEDDED builds: define OS_EMBEDDED for the sources and add target code generation flags.
if(EMBEDDED)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED")
|
||||
# NOTE(review): these flags are hard-coded for Thumb / Cortex-M4 with a hard-float
# fpv4-sp-d16 FPU, so EMBEDDED currently implies that exact target - confirm this is intended.
set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16")
|
||||
endif()
|
||||
|
||||
if (NEED_PIC)
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")
|
||||
|
|
@ -249,8 +315,15 @@ if (NEED_PIC)
|
|||
endif()
|
||||
endif ()
|
||||
|
||||
# Turn on SMALL_MATRIX_OPT automatically for x86_64 builds and for POWER10 cores.
# CORE is quoted so the condition stays well-formed when CORE is unset or empty; unquoted,
# it would collapse to `X86_64 OR STREQUAL POWER10` and abort configuration.
if (X86_64 OR "${CORE}" STREQUAL POWER10)
|
||||
set(SMALL_MATRIX_OPT TRUE)
|
||||
endif ()
|
||||
# Expose the setting to the sources as a preprocessor define (also honours a value set by the user).
if (SMALL_MATRIX_OPT)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT")
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||
if (X86 OR X86_64 OR ARM64 OR POWER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
|
|
@ -290,6 +363,10 @@ if (NO_AVX2)
|
|||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
|
||||
endif ()
|
||||
|
||||
# Pass NO_AVX512 through to the sources as a preprocessor define.
if (NO_AVX512)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif ()
|
||||
|
||||
if (USE_THREAD)
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
# NO_AFFINITY = 1
|
||||
|
|
@ -449,6 +526,9 @@ endif()
|
|||
if (BUILD_COMPLEX16)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
|
||||
endif()
|
||||
# Pass BUILD_BFLOAT16 through to the C sources as a preprocessor define.
if (BUILD_BFLOAT16)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16")
|
||||
endif()
|
||||
if(NOT MSVC)
|
||||
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
|
||||
endif()
|
||||
|
|
|
|||
|
|
@ -20,11 +20,11 @@ endif()
|
|||
|
||||
|
||||
|
||||
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
|
||||
if(MINGW)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
|
||||
OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
|
||||
if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64")
|
||||
set(MINGW64 1)
|
||||
endif()
|
||||
endif()
|
||||
|
|
@ -35,9 +35,11 @@ if(CMAKE_CL_64 OR MINGW64)
|
|||
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
set(PPC 1)
|
||||
set(POWER 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
||||
set(LOONGARCH64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
|
|
@ -71,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING})
|
|||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*")
|
||||
set(MIPS32 1)
|
||||
elseif (${TARGET} STREQUAL "ARMV7")
|
||||
set(ARM 1)
|
||||
else()
|
||||
|
|
@ -84,8 +88,12 @@ if (X86_64)
|
|||
set(ARCH "x86_64")
|
||||
elseif(X86)
|
||||
set(ARCH "x86")
|
||||
elseif(PPC)
|
||||
elseif(POWER)
|
||||
set(ARCH "power")
|
||||
elseif(MIPS32)
|
||||
set(ARCH "mips")
|
||||
elseif(MIPS64)
|
||||
set(ARCH "mips64")
|
||||
elseif(ARM)
|
||||
set(ARCH "arm")
|
||||
elseif(ARM64)
|
||||
|
|
@ -95,7 +103,7 @@ else()
|
|||
endif ()
|
||||
|
||||
if (NOT BINARY)
|
||||
if (X86_64 OR ARM64 OR PPC OR MIPS64)
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
|
||||
set(BINARY 64)
|
||||
else ()
|
||||
set(BINARY 32)
|
||||
|
|
|
|||
|
|
@ -15,35 +15,83 @@ endfunction ()
|
|||
# Reads a Makefile into CMake vars.
|
||||
macro(ParseMakefileVars MAKEFILE_IN)
|
||||
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
set (C_COMPILER ${CMAKE_C_COMPILER_ID})
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
set (SkipIfs 0)
|
||||
set (SkipElse 0)
|
||||
file(STRINGS ${MAKEFILE_IN} makefile_contents)
|
||||
foreach (makefile_line ${makefile_contents})
|
||||
#message(STATUS "parsing ${makefile_line}")
|
||||
if (${IfElse} GREATER 0)
|
||||
#message(STATUS "parsing ${makefile_line}")
|
||||
# Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition.
|
||||
# The variable SkipIfs is used to identify which endif statement closes the scope of the else statement.
|
||||
if (${SkipElse} EQUAL 1)
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
MATH(EXPR SkipIfs "${SkipIfs}+1")
|
||||
endif ()
|
||||
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "ENDIF ${makefile_line}")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
if (${SkipIfs} EQUAL 0)
|
||||
set (SkipElse 0)
|
||||
else ()
|
||||
MATH(EXPR SkipIfs "${SkipIfs}-1")
|
||||
endif ()
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
# The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement.
|
||||
if (${IfElse} GREATER 0)
|
||||
# If the current scope is the one that has to be skipped, the if/endif/else statements
|
||||
# along with it till the endif that closes the current scope have to be ignored as well.
|
||||
string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
MATH(EXPR SkipIfs "${SkipIfs}+1")
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
if (${SkipIfs} EQUAL 0)
|
||||
#message(STATUS "ENDIF ${makefile_line}")
|
||||
set (IfElse 0)
|
||||
set (ElseSeen 0)
|
||||
else ()
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
MATH(EXPR SkipIfs "${SkipIfs}-1")
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "ELSE ${makefile_line}")
|
||||
set (ElseSeen 1)
|
||||
continue ()
|
||||
endif()
|
||||
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
|
||||
# message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
if (${SkipIfs} EQUAL 0)
|
||||
#message(STATUS "ELSE ${makefile_line}")
|
||||
set (ElseSeen 1)
|
||||
else ()
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
endif ()
|
||||
continue ()
|
||||
endif()
|
||||
# Skip the lines that are not part of the path that has to be taken.
|
||||
if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0))
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
# Skip commented lines (the ones that start with '#')
|
||||
string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
continue ()
|
||||
endif ()
|
||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
set(var_name ${CMAKE_MATCH_1})
|
||||
# set(var_value ${CMAKE_MATCH_2})
|
||||
#set(var_value ${CMAKE_MATCH_2})
|
||||
string(STRIP ${CMAKE_MATCH_2} var_value)
|
||||
# check for Makefile variables in the string, e.g. $(TSUFFIX)
|
||||
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
|
||||
|
|
@ -54,36 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value})
|
||||
endforeach ()
|
||||
set(${var_name} ${var_value})
|
||||
else ()
|
||||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on include ${line_match}")
|
||||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
|
||||
else ()
|
||||
# message(STATUS "unmatched line ${line_match}")
|
||||
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
# Include a new file to be parsed
|
||||
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on include ${line_match}")
|
||||
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
|
||||
continue ()
|
||||
endif ()
|
||||
# The if statement that precedes this else has the path taken
|
||||
# Thus, this else statement has to be skipped.
|
||||
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "skipping ${makefile_line}")
|
||||
set (SkipElse 1)
|
||||
continue()
|
||||
endif()
|
||||
# Example 1: ifdef HAVE_MSA
|
||||
# Example 2: ifndef ZNRM2KERNEL
|
||||
string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}")
|
||||
set (ElseSeen 0)
|
||||
if (${CMAKE_MATCH_2})
|
||||
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
endif ()
|
||||
set (IfElse 2)
|
||||
endif ()
|
||||
else ()
|
||||
if (${CMAKE_MATCH_1} STREQUAL "ifdef")
|
||||
set (IfElse 2)
|
||||
else ()
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
endif ()
|
||||
endif ()
|
||||
continue ()
|
||||
endif ()
|
||||
# Example 1: ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
# Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
|
||||
# Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2)
|
||||
# Ignore the second group since (?:...) does not work on cmake
|
||||
string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}")
|
||||
if (DEFINED ${CMAKE_MATCH_1})
|
||||
if (DEFINED ${CMAKE_MATCH_4})
|
||||
set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}})
|
||||
else ()
|
||||
set (STR ${${CMAKE_MATCH_1}})
|
||||
endif ()
|
||||
if (${STR} STREQUAL ${CMAKE_MATCH_5})
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
set (IfElse 2)
|
||||
continue ()
|
||||
endif ()
|
||||
# Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
# Example 2 (Group 4): ifneq ($(C_COMPILER), PGI)
|
||||
string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}")
|
||||
set (ElseSeen 0)
|
||||
set (HasValidGroup 0)
|
||||
if (DEFINED ${CMAKE_MATCH_3})
|
||||
set (HasValidGroup 1)
|
||||
set (STR ${${CMAKE_MATCH_3}})
|
||||
elseif (NOT ${CMAKE_MATCH_4} STREQUAL "")
|
||||
set (HasValidGroup 1)
|
||||
set (STR ${CMAKE_MATCH_4})
|
||||
endif ()
|
||||
if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1)
|
||||
if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR}))
|
||||
#message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
continue ()
|
||||
endif ()
|
||||
endif ()
|
||||
set (IfElse 2)
|
||||
continue ()
|
||||
endif ()
|
||||
#message(STATUS "unmatched line ${line_match}")
|
||||
endforeach ()
|
||||
endmacro ()
|
||||
|
||||
|
|
@ -154,31 +259,31 @@ endfunction ()
|
|||
# STRING - compiles only the given type (e.g. DOUBLE)
|
||||
function(GenerateNamedObjects sources_in)
|
||||
|
||||
if (DEFINED ARGV1)
|
||||
if (${ARGC} GREATER 1)
|
||||
set(defines_in ${ARGV1})
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "")
|
||||
if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "")
|
||||
set(name_in ${ARGV2})
|
||||
# strip off extension for kernel files that pass in the object name.
|
||||
get_filename_component(name_in ${name_in} NAME_WE)
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV3)
|
||||
if (${ARGC} GREATER 3)
|
||||
set(use_cblas ${ARGV3})
|
||||
else ()
|
||||
set(use_cblas false)
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV4)
|
||||
if (${ARGC} GREATER 4)
|
||||
set(replace_last_with ${ARGV4})
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV5)
|
||||
if (${ARGC} GREATER 5)
|
||||
set(append_with ${ARGV5})
|
||||
endif ()
|
||||
|
||||
if (DEFINED ARGV6)
|
||||
if (${ARGC} GREATER 6)
|
||||
set(no_float_type ${ARGV6})
|
||||
else ()
|
||||
set(no_float_type false)
|
||||
|
|
@ -193,7 +298,7 @@ function(GenerateNamedObjects sources_in)
|
|||
set(real_only false)
|
||||
set(complex_only false)
|
||||
set(mangle_complex_sources false)
|
||||
if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "")
|
||||
if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "")
|
||||
if (${ARGV7} EQUAL 1)
|
||||
set(real_only true)
|
||||
elseif (${ARGV7} EQUAL 2)
|
||||
|
|
@ -251,6 +356,19 @@ function(GenerateNamedObjects sources_in)
|
|||
# now add the object and set the defines
|
||||
set(obj_defines ${defines_in})
|
||||
|
||||
# Rewrite a bare "RC" entry in obj_defines as "RC=RC".
# NOTE(review): presumably this gives the define an explicit value instead of the
# compiler's default for a bare -D name - confirm against the sources that test RC.
list(FIND obj_defines "RC" def_idx)
|
||||
if (${def_idx} GREATER -1)
|
||||
# list(REMOVE_AT ${obj_defines} ${def_idx})
|
||||
# REMOVE_ITEM drops every "RC" entry, not only the one at def_idx.
list (REMOVE_ITEM obj_defines "RC")
|
||||
list(APPEND obj_defines "RC=RC")
|
||||
endif ()
|
||||
# Same rewrite for a bare "CR" entry.
list(FIND obj_defines "CR" def_idx)
|
||||
if (${def_idx} GREATER -1)
|
||||
# list(REMOVE_AT ${obj_defines} ${def_idx})
|
||||
list (REMOVE_ITEM obj_defines "CR")
|
||||
list(APPEND obj_defines "CR=CR")
|
||||
endif ()
|
||||
|
||||
if (use_cblas)
|
||||
set(obj_name "cblas_${obj_name}")
|
||||
list(APPEND obj_defines "CBLAS")
|
||||
|
|
@ -295,7 +413,15 @@ function(GenerateNamedObjects sources_in)
|
|||
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
|
||||
file(REMOVE ${new_source_file}.tmp)
|
||||
list(APPEND SRC_LIST_OUT ${new_source_file})
|
||||
|
||||
# Prints the path of every generated source file at configure time.
# NOTE(review): looks like leftover debugging output - confirm it is wanted.
message (STATUS ${new_source_file})
|
||||
# When HAVE_FMA3 is defined, add -mfma as a per-file compile option for selected kernels only.
if (DEFINED HAVE_FMA3)
|
||||
# NOTE(review): "d?" makes the prefix optional, so this pattern matches any path containing
# "rot_k" followed later by a "c", and the dot before the extension is not anchored - confirm
# that the intent is only the srot/drot kernels.
if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
# Same treatment for files whose path contains "dgemv_t_k".
if ( ${new_source_file} MATCHES "dgemv_t_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
endif ()
|
||||
endforeach ()
|
||||
endforeach ()
|
||||
|
||||
|
|
@ -318,17 +444,17 @@ endfunction ()
|
|||
function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme)
|
||||
|
||||
set(alternate_name_in "")
|
||||
if (DEFINED ARGV5)
|
||||
if (${ARGC} GREATER 5)
|
||||
set(alternate_name_in ${ARGV5})
|
||||
endif ()
|
||||
|
||||
set(no_float_type false)
|
||||
if (DEFINED ARGV6)
|
||||
if (${ARGC} GREATER 6)
|
||||
set(no_float_type ${ARGV6})
|
||||
endif ()
|
||||
|
||||
set(complex_filename_scheme "")
|
||||
if (DEFINED ARGV7)
|
||||
if (${ARGC} GREATER 7)
|
||||
set(complex_filename_scheme ${ARGV7})
|
||||
endif ()
|
||||
|
||||
|
|
|
|||
32
common.h
32
common.h
|
|
@ -122,7 +122,7 @@ extern "C" {
|
|||
#define ATOM GOTO_ATOM
|
||||
#undef GOTO_ATOM
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
|
|
@ -134,6 +134,9 @@ extern "C" {
|
|||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
#else
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_SUNOS)
|
||||
|
|
@ -413,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_alpha.h"
|
||||
#endif
|
||||
|
||||
/* On x86 / x86_64 builds where the compiler defines __CET__ and supports __has_include,
   pull in <cet.h> if that header is available. */
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
|
||||
#if __has_include(<cet.h>)
|
||||
#include <cet.h>
|
||||
#endif
|
||||
#endif
|
||||
/* Fallback: if nothing above defined _CET_ENDBR, make it expand to nothing so code that
   uses the macro still builds. */
#ifndef _CET_ENDBR
|
||||
#define _CET_ENDBR
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_X86
|
||||
#include "common_x86.h"
|
||||
#endif
|
||||
|
|
@ -437,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_mips.h"
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#ifdef ARCH_RISCV64
|
||||
#include "common_riscv64.h"
|
||||
#endif
|
||||
|
|
@ -458,6 +470,14 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
|||
#include "common_zarch.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_LOONGARCH64
|
||||
#include "common_loongarch64.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_E2K
|
||||
#include "common_e2k.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
|
|
@ -488,10 +508,12 @@ static inline unsigned long long rpcc(void){
|
|||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv,NULL);
|
||||
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#define RPCC_DEFINED
|
||||
|
|
@ -521,6 +543,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
#include "common_linux.h"
|
||||
#endif
|
||||
|
||||
#ifdef OS_EMBEDDED
|
||||
#define DTB_DEFAULT_ENTRIES 64
|
||||
#endif
|
||||
|
||||
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
|
||||
|
||||
#ifdef __NetBSD__
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define INLINE inline
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
|
||||
#define RETURN_BY_STACK
|
||||
#else
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
|
@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
#if !defined(__APPLE__) && !defined(_WIN32)
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
|
|
|
|||
45
common_c.h
45
common_c.h
|
|
@ -232,6 +232,8 @@
|
|||
|
||||
#define CGEADD_K cgeadd_k
|
||||
|
||||
#define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define CAMAX_K gotoblas -> camax_k
|
||||
|
|
@ -426,8 +428,51 @@
|
|||
|
||||
#define CGEADD_K gotoblas -> cgeadd_k
|
||||
|
||||
#define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn)
|
||||
#define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt)
|
||||
#define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr)
|
||||
#define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn)
|
||||
#define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt)
|
||||
#define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr)
|
||||
#define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn)
|
||||
#define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt)
|
||||
#define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr)
|
||||
#define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn)
|
||||
#define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct)
|
||||
#define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr)
|
||||
#define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt)
|
||||
#define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt)
|
||||
#define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt)
|
||||
#define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc)
|
||||
|
||||
#define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn)
|
||||
#define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct)
|
||||
#define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr)
|
||||
#define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc)
|
||||
|
||||
|
||||
#define CGEMM_NN cgemm_nn
|
||||
#define CGEMM_CN cgemm_cn
|
||||
#define CGEMM_TN cgemm_tn
|
||||
|
|
|
|||
15
common_d.h
15
common_d.h
|
|
@ -157,6 +157,8 @@
|
|||
#define DIMATCOPY_K_RT dimatcopy_k_rt
|
||||
#define DGEADD_K dgeadd_k
|
||||
|
||||
#define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define DAMAX_K gotoblas -> damax_k
|
||||
|
|
@ -281,8 +283,21 @@
|
|||
|
||||
#define DGEADD_K gotoblas -> dgeadd_k
|
||||
|
||||
#define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
#define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn)
|
||||
#define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt)
|
||||
#define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn)
|
||||
#define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt)
|
||||
|
||||
#define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn)
|
||||
#define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt)
|
||||
#define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn)
|
||||
#define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt)
|
||||
|
||||
|
||||
#define DGEMM_NN dgemm_nn
|
||||
#define DGEMM_CN dgemm_tn
|
||||
#define DGEMM_TN dgemm_tn
|
||||
|
|
|
|||
|
|
@ -0,0 +1,64 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef COMMON_E2K
|
||||
#define COMMON_E2K
|
||||
|
||||
#ifdef ASSEMBLER
|
||||
#error
|
||||
#endif
|
||||
|
||||
/* Barrier macros for E2K.
 * MB and WMB both expand to an empty asm statement with a "memory" clobber:
 * the asm template is empty, so no instruction is emitted, but the compiler
 * may not move memory accesses across it. The do { } while (0) wrapper makes
 * each use a single statement. RMB expands to nothing. */
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||
#define RMB
|
||||
|
||||
#define INLINE __attribute__((__always_inline__)) inline
|
||||
|
||||
/* Returns x / y using plain integer division.
 * No check for y == 0 is performed here.
 * NOTE(review): the operands are blasint but the result is returned as int;
 * if blasint is wider than int the quotient is narrowed -- confirm that
 * callers stay within int range. */
static inline int blas_quickdivide(blasint x, blasint y) {
|
||||
return x / y;
|
||||
}
|
||||
|
||||
/* Page-size constants.
 * PAGESIZE defaults to 4 << 10 (4096) unless it was already defined;
 * HUGE_PAGESIZE is unconditionally 2 << 20 (2097152).
 * Presumably both are byte counts -- confirm against the allocator. */
#ifndef PAGESIZE
|
||||
#define PAGESIZE ( 4 << 10)
|
||||
#endif
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
/* BUFFER_SIZE selection.
 * Without BUFFERSIZE the value is 32 << 20. When BUFFERSIZE is defined it is
 * used as the shift count (32 << BUFFERSIZE), not as a size in its own right,
 * so BUFFERSIZE=20 reproduces the default. */
#ifndef BUFFERSIZE
|
||||
#define BUFFER_SIZE (32 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||
#endif
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#endif
|
||||
|
||||
|
|
@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
|
|||
int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
|
|
|||
123
common_level3.h
123
common_level3.h
|
|
@ -515,6 +515,129 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble
|
|||
int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
|
||||
#endif
|
||||
|
||||
/* Prototypes for the small-matrix GEMM entry points; they are declared only
 * when SMALL_MATRIX_OPT is defined. For each prefix (sbgemm, sgemm, dgemm,
 * cgemm, zgemm) the section declares:
 *   - <prefix>_small_matrix_permit(transa, transb, m, n, k, alpha..., beta...)
 *   - <prefix>_small_kernel_<ab>: kernels taking A/lda, alpha, B/ldb, beta,
 *     C/ldc, one per transpose-letter pair (n/t for real types; n/t/r/c for
 *     the complex c/z types)
 *   - <prefix>_small_kernel_b0_<ab>: same argument list without beta
 * The complex variants pass alpha and beta as two scalars each
 * (alpha0/alpha1, beta0/beta1).
 * NOTE(review): "b0" presumably denotes the beta == 0 case and "permit"
 * presumably reports whether the small-kernel path may be used -- confirm
 * against the GEMM interface code. */
#ifdef SMALL_MATRIX_OPT
|
||||
int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
|
||||
|
||||
int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
|
||||
int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1);
|
||||
|
||||
int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1);
|
||||
|
||||
int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
#endif
|
||||
|
||||
int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
|
|
|
|||
|
|
@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
|
|||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
#else
|
||||
#if defined (LOONGSON3B)
|
||||
#if defined (__64BIT__)
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#else
|
||||
return 0; //NULL Implementation on Loongson 3B 32bit.
|
||||
#endif
|
||||
#else
|
||||
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
|
||||
// unsigned long null_nodemask=0;
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,199 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#ifndef COMMON_LOONGARCH64
|
||||
#define COMMON_LOONGARCH64
|
||||
|
||||
/* Barrier macros for LoongArch64: MB, WMB and RMB all expand to the same
 * compiler builtin call, __sync_synchronize(), so no distinction is made
 * between full, write and read barriers here. */
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/* Returns x / y using plain integer division.
 * No check for y == 0 is performed here.
 * NOTE(review): the operands are blasint but the result is returned as int;
 * if blasint is wider than int the quotient is narrowed -- confirm that
 * callers stay within int range. */
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
}
|
||||
|
||||
/* GET_IMAGE(res): inline asm that moves floating-point register $f2 into
 * res, using fmov.d when DOUBLE is defined and fmov.s otherwise. The asm is
 * volatile and carries a "memory" clobber.
 * NOTE(review): presumably this fetches the imaginary part of a complex
 * result returned in $f2 -- confirm against the calling convention used by
 * the kernels.
 * GET_IMAGE_CANCEL expands to nothing. */
#ifdef DOUBLE
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||
#else
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
|
||||
#endif
|
||||
|
||||
#define GET_IMAGE_CANCEL
|
||||
|
||||
#else
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define LD fld.d
|
||||
#define ST fst.d
|
||||
#define MADD fmadd.d
|
||||
#define NMADD fnmadd.d
|
||||
#define MSUB fmsub.d
|
||||
#define NMSUB fnmsub.d
|
||||
#define ADD fadd.d
|
||||
#define SUB fsub.d
|
||||
#define MUL fmul.d
|
||||
#define MOV fmov.d
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.d
|
||||
#define FABS fabs.d
|
||||
#define CMPEQ fcmp.ceq.d
|
||||
#define CMPLE fcmp.cle.d
|
||||
#define CMPLT fcmp.clt.d
|
||||
#define NEG fneg.d
|
||||
#else
|
||||
#define LD fld.s
|
||||
#define ST fst.s
|
||||
#define MADD fmadd.s
|
||||
#define NMADD fnmadd.s
|
||||
#define MSUB fmsub.s
|
||||
#define NMSUB fnmsub.s
|
||||
#define ADD fadd.s
|
||||
#define SUB fsub.s
|
||||
#define MUL fmul.s
|
||||
#define MOV fmov.s
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.w
|
||||
#define FABS fabs.s
|
||||
#define CMPEQ fcmp.ceq.s
|
||||
#define CMPLE fcmp.cle.s
|
||||
#define CMPLT fcmp.clt.s
|
||||
#define NEG fneg.s
|
||||
#endif /* defined(DOUBLE) */
|
||||
|
||||
/* Integer load/store mnemonics used by the assembly kernels.
 *   __64BIT__ and USE64BITINT : LDINT = ld.d, LDARG = ld.d, SDARG = st.d
 *   __64BIT__ only            : LDINT = ld.w, LDARG = ld.d, SDARG = st.d
 *   otherwise                 : LDINT = ld.w, LDARG = ld.w, SDARG = st.w
 * Only LDINT depends on USE64BITINT; LDARG/SDARG depend on __64BIT__ alone. */
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
#define LDINT ld.d
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#elif defined(__64BIT__) && !defined(USE64BITINT)
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#else
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.w
|
||||
#define SDARG st.w
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif /* defined(F_INTERFACE) */
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
/* PROLOGUE: opening boilerplate for an assembly routine. It switches to the
 * .text section, applies ".align 5", exports REALNAME with .globl, marks it
 * as @function, and emits the REALNAME label. REALNAME is ASMNAME or
 * ASMFNAME depending on F_INTERFACE (selected just above). */
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 5 ;\
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function ;\
|
||||
REALNAME: ;\
|
||||
|
||||
/* GNUSTACK emits an empty ".note.GNU-stack" progbits section when both
 * __linux__ and __ELF__ are defined, and expands to nothing otherwise.
 * EPILOGUE closes a routine opened with PROLOGUE: it emits ".end REALNAME"
 * followed by GNUSTACK. */
#if defined(__linux__) && defined(__ELF__)
|
||||
#define GNUSTACK .section .note.GNU-stack,"",@progbits
|
||||
#else
|
||||
#define GNUSTACK
|
||||
#endif /* defined(__linux__) && defined(__ELF__) */
|
||||
|
||||
#define EPILOGUE \
|
||||
.end REALNAME ;\
|
||||
GNUSTACK
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
/* MOVT(dst, src, cc): conditional register move for assembly code.
 * "bceqz cc, 1f" branches forward to the local label 1 when condition flag
 * cc is zero, skipping the move; otherwise "add.d dst, src, $r0" writes
 * src + $r0 into dst ($r0 is the LoongArch zero register, so this is a copy).
 * The macro defines numeric local label 1 at its end. */
#define MOVT(dst, src, cc) \
|
||||
bceqz cc, 1f; \
|
||||
add.d dst, src, $r0; \
|
||||
1:
|
||||
|
||||
#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
|
||||
|
||||
#endif /* defined(ASSEMBLER) */
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
/* Fixed sizing constants for LoongArch64 (no override hooks here):
 *   BUFFER_SIZE    = 32 << 20
 *   PAGESIZE       = 16UL << 10 (16384)
 *   FIXED_PAGESIZE = 16UL << 10 (same value as PAGESIZE)
 *   HUGE_PAGESIZE  = 2 << 20
 * Presumably all are byte counts -- confirm against the allocator. */
#define BUFFER_SIZE ( 32 << 20)
|
||||
|
||||
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
123
common_macro.h
123
common_macro.h
|
|
@ -644,6 +644,17 @@
|
|||
|
||||
#define GEADD_K DGEADD_K
|
||||
|
||||
#define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT
|
||||
#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT
|
||||
|
||||
#elif defined(BFLOAT16)
|
||||
|
||||
#define D_TO_BF16_K SBDTOBF16_K
|
||||
|
|
@ -931,6 +942,18 @@
|
|||
|
||||
#define GEADD_K SGEADD_K
|
||||
|
||||
#define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
#define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
|
@ -1236,6 +1259,19 @@
|
|||
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K SGEADD_K
|
||||
|
||||
#define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT
|
||||
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
|
|
@ -2063,6 +2099,48 @@
|
|||
|
||||
#define GEADD_K ZGEADD_K
|
||||
|
||||
#define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR
|
||||
#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT
|
||||
#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR
|
||||
#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN
|
||||
#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT
|
||||
#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR
|
||||
#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN
|
||||
#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT
|
||||
#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR
|
||||
#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR
|
||||
#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT
|
||||
#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR
|
||||
#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN
|
||||
#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT
|
||||
#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR
|
||||
#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN
|
||||
#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT
|
||||
#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR
|
||||
#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC
|
||||
|
||||
#else
|
||||
|
||||
#define AMAX_K CAMAX_K
|
||||
|
|
@ -2486,11 +2564,54 @@
|
|||
|
||||
#define GEADD_K CGEADD_K
|
||||
|
||||
#define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT
|
||||
|
||||
#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN
|
||||
#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT
|
||||
#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR
|
||||
#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN
|
||||
#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT
|
||||
#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR
|
||||
#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN
|
||||
#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT
|
||||
#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR
|
||||
#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN
|
||||
#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT
|
||||
#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR
|
||||
#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN
|
||||
#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT
|
||||
#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR
|
||||
#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN
|
||||
#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT
|
||||
#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR
|
||||
#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN
|
||||
#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT
|
||||
#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR
|
||||
#define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC
|
||||
|
||||
#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN
|
||||
#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT
|
||||
#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR
|
||||
#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|
||||
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sbgemm_p;
|
||||
|
|
|
|||
|
|
@ -229,12 +229,7 @@ REALNAME: ;\
|
|||
|
||||
#define BUFFER_SIZE ( 32 << 21)
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3B)
|
||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#endif
|
||||
|
|
@ -250,7 +245,7 @@ REALNAME: ;\
|
|||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
|
||||
#define PREFETCHD_(x) ld $0, x
|
||||
#define PREFETCHD(x) PREFETCHD_(x)
|
||||
#else
|
||||
|
|
|
|||
132
common_param.h
132
common_param.h
|
|
@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
|
|
@ -207,6 +220,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
#ifdef BUILD_SINGLE
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
|
||||
|
||||
int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
|
||||
|
||||
int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
#endif
|
||||
|
||||
int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
|
@ -314,6 +341,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
#ifdef BUILD_DOUBLE
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
|
||||
|
||||
int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
|
||||
|
||||
int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
#endif
|
||||
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
|
|
@ -513,6 +553,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1);
|
||||
|
||||
int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
|
||||
int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
|
||||
#endif
|
||||
|
||||
int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
|
@ -679,6 +763,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
|
|||
int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
|
||||
#ifdef SMALL_MATRIX_OPT
|
||||
int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1);
|
||||
|
||||
int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
|
||||
int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
|
||||
#endif
|
||||
|
||||
int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
|
|
@ -1069,6 +1197,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
extern gotoblas_t *gotoblas;
|
||||
|
||||
/* Byte offset of member `func` inside gotoblas_t, obtained by taking the
   member's address through a null gotoblas_t pointer (offsetof idiom).
   The whole expansion is parenthesized so it stays a single operand when
   the macro is used inside a larger expression. */
#define FUNC_OFFSET(func) ((size_t)(&((gotoblas_t *)NULL)->func))
|
||||
|
||||
#define DTB_ENTRIES gotoblas -> dtb_entries
|
||||
#define GEMM_OFFSET_A gotoblas -> offsetA
|
||||
#define GEMM_OFFSET_B gotoblas -> offsetB
|
||||
|
|
@ -1174,6 +1304,8 @@ extern gotoblas_t *gotoblas;
|
|||
|
||||
#else
|
||||
|
||||
/* Variant for the #else branch: no gotoblas_t lookup is involved, the
   result is `func` itself converted to size_t.
   NOTE(review): presumably this is the build without the runtime dispatch
   table -- confirm against the enclosing #if, which is not visible here.
   The whole expansion is parenthesized so it stays a single operand when
   the macro is used inside a larger expression. */
#define FUNC_OFFSET(func) ((size_t)(func))
|
||||
|
||||
#define DTB_ENTRIES DTB_DEFAULT_ENTRIES
|
||||
|
||||
#define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A
|
||||
|
|
|
|||
15
common_s.h
15
common_s.h
|
|
@ -164,6 +164,8 @@
|
|||
|
||||
#define SGEADD_K sgeadd_k
|
||||
|
||||
#define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define SAMAX_K gotoblas -> samax_k
|
||||
|
|
@ -299,8 +301,21 @@
|
|||
|
||||
#define SGEADD_K gotoblas -> sgeadd_k
|
||||
|
||||
#define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
#define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn)
|
||||
#define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt)
|
||||
#define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn)
|
||||
#define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt)
|
||||
|
||||
#define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn)
|
||||
#define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt)
|
||||
#define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn)
|
||||
#define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt)
|
||||
|
||||
|
||||
#define SGEMM_NN sgemm_nn
|
||||
#define SGEMM_CN sgemm_tn
|
||||
#define SGEMM_TN sgemm_tn
|
||||
|
|
|
|||
12
common_sb.h
12
common_sb.h
|
|
@ -24,6 +24,7 @@
|
|||
#define SBGEMM_BETA sbgemm_beta
|
||||
#define SBGEMM_KERNEL sbgemm_kernel
|
||||
|
||||
#define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit
|
||||
#else
|
||||
|
||||
#define SBDOT_K gotoblas -> sbdot_k
|
||||
|
|
@ -41,8 +42,19 @@
|
|||
#define SBGEMM_BETA gotoblas -> sbgemm_beta
|
||||
#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel
|
||||
|
||||
#define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit
|
||||
#endif
|
||||
|
||||
#define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn)
|
||||
#define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt)
|
||||
#define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn)
|
||||
#define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt)
|
||||
|
||||
#define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn)
|
||||
#define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt)
|
||||
#define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn)
|
||||
#define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt)
|
||||
|
||||
#define SBGEMM_NN sbgemm_nn
|
||||
#define SBGEMM_CN sbgemm_tn
|
||||
#define SBGEMM_TN sbgemm_tn
|
||||
|
|
|
|||
|
|
@ -340,7 +340,8 @@ REALNAME:
|
|||
.align 16; \
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function; \
|
||||
REALNAME:
|
||||
REALNAME: \
|
||||
_CET_ENDBR
|
||||
|
||||
#ifdef PROFILE
|
||||
#define PROFCODE call mcount
|
||||
|
|
|
|||
|
|
@ -451,7 +451,8 @@ REALNAME:
|
|||
.align 512; \
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function; \
|
||||
REALNAME:
|
||||
REALNAME: \
|
||||
_CET_ENDBR
|
||||
|
||||
#ifdef PROFILE
|
||||
#define PROFCODE call *mcount@GOTPCREL(%rip)
|
||||
|
|
|
|||
45
common_z.h
45
common_z.h
|
|
@ -232,6 +232,8 @@
|
|||
|
||||
#define ZGEADD_K zgeadd_k
|
||||
|
||||
#define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit
|
||||
|
||||
#else
|
||||
|
||||
#define ZAMAX_K gotoblas -> zamax_k
|
||||
|
|
@ -426,8 +428,51 @@
|
|||
|
||||
#define ZGEADD_K gotoblas -> zgeadd_k
|
||||
|
||||
#define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit
|
||||
|
||||
#endif
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn)
|
||||
#define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt)
|
||||
#define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr)
|
||||
#define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn)
|
||||
#define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt)
|
||||
#define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr)
|
||||
#define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn)
|
||||
#define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt)
|
||||
#define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr)
|
||||
#define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn)
|
||||
#define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct)
|
||||
#define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr)
|
||||
#define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc)
|
||||
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr)
|
||||
#define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc)
|
||||
|
||||
|
||||
#define ZGEMM_NN zgemm_nn
|
||||
#define ZGEMM_CN zgemm_cn
|
||||
#define ZGEMM_TN zgemm_tn
|
||||
|
|
|
|||
|
|
@ -1,13 +1,14 @@
|
|||
include ../Makefile.rule
|
||||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
all :: dgemv_tester dgemm_tester
|
||||
|
||||
dgemv_tester :
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester
|
||||
./dgemv_tester
|
||||
|
||||
dgemm_tester : dgemv_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
|
||||
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester
|
||||
./dgemm_tester
|
||||
|
||||
clean ::
|
||||
|
|
|
|||
4
cpuid.h
4
cpuid.h
|
|
@ -54,6 +54,7 @@
|
|||
#define VENDOR_TRANSMETA 9
|
||||
#define VENDOR_NSC 10
|
||||
#define VENDOR_HYGON 11
|
||||
#define VENDOR_ZHAOXIN 12
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
|
@ -119,6 +120,7 @@
|
|||
#define CORE_SKYLAKEX 28
|
||||
#define CORE_DHYANA 29
|
||||
#define CORE_COOPERLAKE 30
|
||||
#define CORE_SAPPHIRERAPIDS 31
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
|
|
@ -144,6 +146,7 @@
|
|||
#define HAVE_AVX512VL (1 << 21)
|
||||
#define HAVE_AVX2 (1 << 22)
|
||||
#define HAVE_AVX512BF16 (1 << 23)
|
||||
#define HAVE_AMXBF16 (1 << 24)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
|
|
@ -221,6 +224,7 @@ typedef struct {
|
|||
#define CPUTYPE_SKYLAKEX 52
|
||||
#define CPUTYPE_DHYANA 53
|
||||
#define CPUTYPE_COOPERLAKE 54
|
||||
#define CPUTYPE_SAPPHIRERAPIDS 55
|
||||
|
||||
#define CPUTYPE_HYGON_UNKNOWN 99
|
||||
|
||||
|
|
|
|||
349
cpuid_arm64.c
349
cpuid_arm64.c
|
|
@ -26,20 +26,25 @@
|
|||
*****************************************************************************/
|
||||
|
||||
#include <string.h>
|
||||
#ifdef OS_DARWIN
|
||||
#ifdef __APPLE__
|
||||
#include <sys/sysctl.h>
|
||||
int32_t value;
|
||||
size_t length=sizeof(value);
|
||||
int64_t value64;
|
||||
size_t length64=sizeof(value64);
|
||||
#endif
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_ARMV8 1
|
||||
// Arm
|
||||
#define CPU_CORTEXA53 2
|
||||
#define CPU_CORTEXA55 14
|
||||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
#define CPU_NEOVERSEN1 11
|
||||
#define CPU_NEOVERSEV1 16
|
||||
#define CPU_NEOVERSEN2 17
|
||||
// Qualcomm
|
||||
#define CPU_FALKOR 6
|
||||
// Cavium
|
||||
|
|
@ -52,6 +57,8 @@ size_t length=sizeof(value);
|
|||
#define CPU_EMAG8180 10
|
||||
// Apple
|
||||
#define CPU_VORTEX 13
|
||||
// Fujitsu
|
||||
#define CPU_A64FX 15
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
|
|
@ -66,8 +73,12 @@ static char *cpuname[] = {
|
|||
"TSV110",
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1",
|
||||
"NEOVERSEV1"
|
||||
"NEOVERSEN2"
|
||||
"THUNDERX3T110",
|
||||
"VORTEX"
|
||||
"VORTEX",
|
||||
"CORTEXA55",
|
||||
"A64FX"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
|
@ -83,8 +94,12 @@ static char *cpuname_lower[] = {
|
|||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"neoversev1",
|
||||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"vortex"
|
||||
"vortex",
|
||||
"cortexa55",
|
||||
"a64fx"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
|
@ -161,6 +176,12 @@ int detect(void)
|
|||
return CPU_CORTEXA73;
|
||||
else if (strstr(cpu_part, "0xd0c"))
|
||||
return CPU_NEOVERSEN1;
|
||||
else if (strstr(cpu_part, "0xd40"))
|
||||
return CPU_NEOVERSEV1;
|
||||
else if (strstr(cpu_part, "0xd49"))
|
||||
return CPU_NEOVERSEN2;
|
||||
else if (strstr(cpu_part, "0xd05"))
|
||||
return CPU_CORTEXA55;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
|
|
@ -178,6 +199,9 @@ int detect(void)
|
|||
// Ampere
|
||||
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
|
||||
return CPU_EMAG8180;
|
||||
// Fujitsu
|
||||
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
|
||||
return CPU_A64FX;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
|
|
@ -207,9 +231,9 @@ int detect(void)
|
|||
|
||||
}
|
||||
#else
|
||||
#ifdef DARWIN
|
||||
#ifdef __APPLE__
|
||||
sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
|
||||
if (value ==131287967) return CPU_VORTEX;
|
||||
if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
|
||||
#endif
|
||||
return CPU_ARMV8;
|
||||
#endif
|
||||
|
|
@ -260,7 +284,7 @@ int n=0;
|
|||
|
||||
printf("#define NUM_CORES %d\n",n);
|
||||
#endif
|
||||
#ifdef DARWIN
|
||||
#ifdef __APPLE__
|
||||
sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0);
|
||||
printf("#define NUM_CORES %d\n",value);
|
||||
#endif
|
||||
|
|
@ -280,153 +304,196 @@ void get_cpuconfig(void)
|
|||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_CORTEXA53:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
case CPU_CORTEXA53:
|
||||
case CPU_CORTEXA55:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
case CPU_CORTEXA57:
|
||||
case CPU_CORTEXA72:
|
||||
case CPU_CORTEXA73:
|
||||
case CPU_CORTEXA57:
|
||||
case CPU_CORTEXA72:
|
||||
case CPU_CORTEXA73:
|
||||
// Common minimum settings for these Arm cores
|
||||
// Can change a lot, but we need to be conservative
|
||||
// TODO: detect info from /sys if possible
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_NEOVERSEN1:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_NEOVERSEN1:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
case CPU_NEOVERSEV1:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX:
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 16777216\n");
|
||||
printf("#define L2_LINESIZE 128\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
case CPU_NEOVERSEN2:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX2T99:
|
||||
printf("#define THUNDERX2T99 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 524288\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX:
|
||||
printf("#define THUNDERX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 128\n");
|
||||
printf("#define L2_SIZE 16777216\n");
|
||||
printf("#define L2_LINESIZE 128\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX2T99:
|
||||
printf("#define THUNDERX2T99 \n");
|
||||
printf("#define L1_CODE_SIZE 32768 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 262144 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 33554432 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_TSV110:
|
||||
printf("#define TSV110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||
printf("#define L1_DATA_SIZE 65536 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||
printf("#define L2_SIZE 524228 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
case CPU_TSV110:
|
||||
printf("#define TSV110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||
printf("#define L1_DATA_SIZE 65536 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||
printf("#define L2_SIZE 524228 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
|
||||
case CPU_EMAG8180:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define EMAG8180\n");
|
||||
printf("#define L1_CODE_SIZE 32768\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_EMAG8180:
|
||||
// Minimum parameters for ARMv8 (based on A53)
|
||||
printf("#define EMAG8180\n");
|
||||
printf("#define L1_CODE_SIZE 32768\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_THUNDERX3T110:
|
||||
printf("#define THUNDERX3T110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 524288 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 94371840 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#ifdef DARWIN
|
||||
case CPU_VORTEX:
|
||||
printf("#define VORTEX \n");
|
||||
sysctlbyname("hw.l1icachesize",&value,&length,NULL,0);
|
||||
printf("#define L1_CODE_SIZE %d \n",value);
|
||||
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0);
|
||||
printf("#define L1_CODE_LINESIZE %d \n",value);
|
||||
sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0);
|
||||
printf("#define L1_DATA_SIZE %d \n",value);
|
||||
sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0);
|
||||
printf("#define L2_SIZE %d \n",value);
|
||||
break;
|
||||
case CPU_THUNDERX3T110:
|
||||
printf("#define THUNDERX3T110 \n");
|
||||
printf("#define L1_CODE_SIZE 65536 \n");
|
||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||
printf("#define L1_DATA_SIZE 32768 \n");
|
||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
||||
printf("#define L2_SIZE 524288 \n");
|
||||
printf("#define L2_LINESIZE 64 \n");
|
||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||
printf("#define L3_SIZE 94371840 \n");
|
||||
printf("#define L3_LINESIZE 64 \n");
|
||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#ifdef __APPLE__
|
||||
case CPU_VORTEX:
|
||||
printf("#define VORTEX \n");
|
||||
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L1_CODE_SIZE %lld \n",value64);
|
||||
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
|
||||
printf("#define L1_CODE_LINESIZE %lld \n",value64);
|
||||
sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L1_DATA_SIZE %lld \n",value64);
|
||||
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L2_SIZE %lld \n",value64);
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#endif
|
||||
case CPU_A64FX:
|
||||
printf("#define A64FX\n");
|
||||
printf("#define L1_CODE_SIZE 65535\n");
|
||||
printf("#define L1_DATA_SIZE 65535\n");
|
||||
printf("#define L1_DATA_LINESIZE 256\n");
|
||||
printf("#define L2_SIZE 8388608\n");
|
||||
printf("#define L2_LINESIZE 256\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,110 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Core identifiers returned by detect(); each value is also the index of the
   matching entry in cpuname[] below. */
#define CPU_UNKNOWN     0
#define CPU_LOONGSON3R5 1

/* Configuration word number passed to the cpucfg instruction in detect(). */
#define LOONGARCH_CFG2  0x02
/* Bit tested in that word to recognise a LOONGSON3R5 core. Parenthesized so
   the macro expands safely inside any larger expression. */
#define LOONGARCH_LASX  (1<<7)

/* Printable core names, indexed by the CPU_* identifiers above. */
static char *cpuname[] = {
  "UNKNOWN",
  "LOONGSON3R5"
};
|
||||
|
||||
int detect(void) {
|
||||
uint32_t reg = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (reg & LOONGARCH_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
||||
char *get_corename(void) {
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
/* Print the architecture name on stdout (no trailing newline). */
void get_architecture(void) {
  fputs("LOONGARCH64", stdout);
}
|
||||
|
||||
void get_subarchitecture(void) {
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("LOONGSON3R5");
|
||||
} else {
|
||||
printf("UNKNOWN");
|
||||
}
|
||||
}
|
||||
|
||||
/* Print the architecture sub-directory name on stdout (no trailing newline). */
void get_subdirname(void) {
  fputs("loongarch64", stdout);
}
|
||||
|
||||
void get_cpuconfig(void) {
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("#define LOONGSON3R5\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
} else {
|
||||
printf("#define LOONGSON3R5\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
}
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("loongson3r5\n");
|
||||
} else {
|
||||
printf("loongarch64\n");
|
||||
}
|
||||
}
|
||||
36
cpuid_mips.c
36
cpuid_mips.c
|
|
@ -165,6 +165,7 @@ void get_cpuconfig(void){
|
|||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
if (!get_feature("msa")) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
|
|
@ -178,3 +179,38 @@ void get_libname(void){
|
|||
printf("mips\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Report whether a CPU feature is listed in /proc/cpuinfo.
 *
 * Scans /proc/cpuinfo for the first line starting with "Features" or
 * "ASEs implemented" and splits the text after the ':' into
 * whitespace-separated tokens.
 *
 * search: feature name to look for (e.g. "msa"); compared with strstr(), so
 *         a token that merely contains the name also counts as a match.
 *
 * Returns 1 when a token matches, 0 otherwise. Also returns 0 when search is
 * NULL, when the file cannot be opened, when no such line exists, and on
 * builds where __linux is not defined.
 */
int get_feature(char *search)
{

#ifdef __linux
  FILE *infile;
  char buffer[2048], *p, *t;
  p = (char *) NULL;

  if (search == NULL) return 0;

  infile = fopen("/proc/cpuinfo", "r");
  /* Without this check fgets()/fclose() would be handed a NULL stream. */
  if (infile == NULL) return 0;

  while (fgets(buffer, sizeof(buffer), infile))
  {
    if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16))
    {
      /* Keep the raw strchr() result so a line without ':' is detected
         below instead of producing an invalid pointer. */
      p = strchr(buffer, ':');
      break;
    }
  }

  fclose(infile);

  if (p == NULL) return 0;
  p++;  /* step past the ':'; leading blanks are consumed by strtok() */

  /* Examine every token, including the first one on the line. */
  for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
  {
    if (strstr(t, search)) { return(1); }
  }

#endif
  return(0);
}
|
||||
|
||||
|
|
|
|||
129
cpuid_mips64.c
129
cpuid_mips64.c
|
|
@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3A 2
|
||||
#define CPU_LOONGSON3B 3
|
||||
#define CPU_I6400 4
|
||||
#define CPU_P6600 5
|
||||
#define CPU_I6500 6
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3R3 2
|
||||
#define CPU_LOONGSON3R4 3
|
||||
#define CPU_I6400 4
|
||||
#define CPU_P6600 5
|
||||
#define CPU_I6500 6
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B",
|
||||
"LOONGSON3R3",
|
||||
"LOONGSON3R4",
|
||||
"I6400",
|
||||
"P6600",
|
||||
"I6500"
|
||||
|
|
@ -90,48 +90,13 @@ static char *cpuname[] = {
|
|||
|
||||
int detect(void){
|
||||
|
||||
#ifdef __linux
|
||||
#ifdef linux
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("cpu", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("system type", buffer, 11)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if (strstr(p, "loongson3a"))
|
||||
return CPU_LOONGSON3A;
|
||||
}else{
|
||||
return CPU_SICORTEX;
|
||||
}
|
||||
}
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
|
|
@ -139,12 +104,14 @@ int detect(void){
|
|||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}
|
||||
if (p != NULL){
|
||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
|
||||
return CPU_LOONGSON3R3;
|
||||
} else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
|
||||
return CPU_LOONGSON3R4;
|
||||
} else{
|
||||
return CPU_SICORTEX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
|
|
@ -159,10 +126,10 @@ void get_architecture(void){
|
|||
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("LOONGSON3A");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("LOONGSON3B");
|
||||
if(detect()==CPU_LOONGSON3R3) {
|
||||
printf("LOONGSON3R3");
|
||||
}else if(detect()==CPU_LOONGSON3R4){
|
||||
printf("LOONGSON3R4");
|
||||
}else if(detect()==CPU_I6400){
|
||||
printf("I6400");
|
||||
}else if(detect()==CPU_P6600){
|
||||
|
|
@ -179,8 +146,8 @@ void get_subdirname(void){
|
|||
}
|
||||
|
||||
void get_cpuconfig(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("#define LOONGSON3A\n");
|
||||
if(detect()==CPU_LOONGSON3R3) {
|
||||
printf("#define LOONGSON3R3\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
|
|
@ -188,8 +155,8 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("#define LOONGSON3B\n");
|
||||
}else if(detect()==CPU_LOONGSON3R4){
|
||||
printf("#define LOONGSON3R4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
|
|
@ -234,13 +201,14 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}
|
||||
if (!get_feature("msa")) printf("#define NO_MSA\n");
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("loongson3a\n");
|
||||
}else if(detect()==CPU_LOONGSON3B) {
|
||||
printf("loongson3b\n");
|
||||
if(detect()==CPU_LOONGSON3R3) {
|
||||
printf("loongson3r3\n");
|
||||
}else if(detect()==CPU_LOONGSON3R4) {
|
||||
printf("loongson3r4\n");
|
||||
}else if(detect()==CPU_I6400) {
|
||||
printf("i6400\n");
|
||||
}else if(detect()==CPU_P6600) {
|
||||
|
|
@ -251,3 +219,38 @@ void get_libname(void){
|
|||
printf("mips64\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Report whether a CPU feature is listed in /proc/cpuinfo.
 *
 * Scans /proc/cpuinfo for the first line starting with "Features" or
 * "ASEs implemented" and splits the text after the ':' into
 * whitespace-separated tokens.
 *
 * search: feature name to look for (e.g. "msa"); compared with strstr(), so
 *         a token that merely contains the name also counts as a match.
 *
 * Returns 1 when a token matches, 0 otherwise. Also returns 0 when search is
 * NULL, when the file cannot be opened, when no such line exists, and on
 * builds where __linux is not defined.
 */
int get_feature(char *search)
{

#ifdef __linux
  FILE *infile;
  char buffer[2048], *p, *t;
  p = (char *) NULL;

  if (search == NULL) return 0;

  infile = fopen("/proc/cpuinfo", "r");
  /* Without this check fgets()/fclose() would be handed a NULL stream. */
  if (infile == NULL) return 0;

  while (fgets(buffer, sizeof(buffer), infile))
  {
    if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16))
    {
      /* Keep the raw strchr() result so a line without ':' is detected
         below instead of producing an invalid pointer. */
      p = strchr(buffer, ':');
      break;
    }
  }

  fclose(infile);

  if (p == NULL) return 0;
  p++;  /* step past the ':'; leading blanks are consumed by strtok() */

  /* Examine every token, including the first one on the line. */
  for (t = strtok(p, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
  {
    if (strstr(t, search)) { return(1); }
  }

#endif
  return(0);
}
|
||||
|
||||
|
|
|
|||
297
cpuid_x86.c
297
cpuid_x86.c
|
|
@ -1,3 +1,4 @@
|
|||
//{
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
|
|
@ -266,6 +267,31 @@ int support_avx512_bf16(){
|
|||
#endif
|
||||
}
|
||||
|
||||
#define BIT_AMX_TILE 0x01000000
|
||||
#define BIT_AMX_BF16 0x00400000
|
||||
#define BIT_AMX_ENBD 0x00060000
|
||||
|
||||
int support_amx_bf16() {
|
||||
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||
int eax, ebx, ecx, edx;
|
||||
int ret=0;
|
||||
|
||||
if (!support_avx512())
|
||||
return 0;
|
||||
// CPUID.7.0:EDX indicates AMX support
|
||||
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
|
||||
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
|
||||
// CPUID.D.0:EAX[17:18] indicates AMX enabled
|
||||
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
|
||||
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
|
||||
ret = 1;
|
||||
}
|
||||
return ret;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
char vendor[13];
|
||||
|
|
@ -283,6 +309,7 @@ int get_vendor(void){
|
|||
if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX;
|
||||
if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN;
|
||||
if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN;
|
||||
if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE;
|
||||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
|
||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
|
||||
|
|
@ -296,9 +323,11 @@ int get_vendor(void){
|
|||
|
||||
int get_cputype(int gettype){
|
||||
int eax, ebx, ecx, edx;
|
||||
/*
|
||||
int extend_family, family;
|
||||
int extend_model, model;
|
||||
int type, stepping;
|
||||
*/
|
||||
int feature = 0;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
|
@ -352,6 +381,7 @@ int get_cputype(int gettype){
|
|||
if (support_avx2()) feature |= HAVE_AVX2;
|
||||
if (support_avx512()) feature |= HAVE_AVX512VL;
|
||||
if (support_avx512_bf16()) feature |= HAVE_AVX512BF16;
|
||||
if (support_amx_bf16()) feature |= HAVE_AMXBF16;
|
||||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
||||
#endif
|
||||
|
||||
|
|
@ -400,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
|||
cpuid(0, &cpuid_level, &ebx, &ecx, &edx);
|
||||
|
||||
if (cpuid_level > 1) {
|
||||
int numcalls =0 ;
|
||||
int numcalls;
|
||||
|
||||
cpuid(2, &eax, &ebx, &ecx, &edx);
|
||||
numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries
|
||||
info[ 0] = BITMASK(eax, 8, 0xff);
|
||||
|
|
@ -1066,7 +1097,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
|||
|
||||
if ((get_vendor() == VENDOR_AMD) ||
|
||||
(get_vendor() == VENDOR_HYGON) ||
|
||||
(get_vendor() == VENDOR_CENTAUR)) {
|
||||
(get_vendor() == VENDOR_CENTAUR) ||
|
||||
(get_vendor() == VENDOR_ZHAOXIN)) {
|
||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
LDTB.size = 4096;
|
||||
|
|
@ -1189,7 +1221,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
|||
|
||||
int get_cpuname(void){
|
||||
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
int family, exfamily, model, vendor, exmodel, stepping;
|
||||
|
||||
if (!have_cpuid()) return CPUTYPE_80386;
|
||||
|
||||
|
|
@ -1197,6 +1229,7 @@ int get_cpuname(void){
|
|||
exfamily = get_cputype(GET_EXFAMILY);
|
||||
model = get_cputype(GET_MODEL);
|
||||
exmodel = get_cputype(GET_EXMODEL);
|
||||
stepping = get_cputype(GET_STEPPING);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
|
|
@ -1398,6 +1431,17 @@ int get_cpuname(void){
|
|||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 10: // Ice Lake SP
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
|
|
@ -1415,9 +1459,18 @@ int get_cpuname(void){
|
|||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 12: // Tiger Lake
|
||||
case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz)
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
|
|
@ -1425,21 +1478,74 @@ int get_cpuname(void){
|
|||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
case 10: //family 6 exmodel 10
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
case 15: // Sapphire Rapids
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
switch (model) {
|
||||
case 7: // Alder Lake desktop
|
||||
case 10: // Alder Lake mobile
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 13: // Ice Lake NNPI
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 10: //family 6 exmodel 10
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7: // Rocket Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 0x7:
|
||||
return CPUTYPE_ITANIUM;
|
||||
case 0xf:
|
||||
|
|
@ -1538,7 +1644,6 @@ int get_cpuname(void){
|
|||
else
|
||||
return CPUTYPE_BARCELONA;
|
||||
}
|
||||
break;
|
||||
case 10: // Zen3
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
|
|
@ -1598,13 +1703,20 @@ int get_cpuname(void){
|
|||
switch (family) {
|
||||
case 0x5:
|
||||
return CPUTYPE_CENTAURC6;
|
||||
break;
|
||||
case 0x6:
|
||||
return CPUTYPE_NANO;
|
||||
break;
|
||||
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return CPUTYPE_NANO;
|
||||
return CPUTYPE_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return CPUTYPE_NEHALEM;
|
||||
else
|
||||
return CPUTYPE_VIAC3;
|
||||
}
|
||||
return CPUTYPE_VIAC3;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_RISE){
|
||||
|
|
@ -1837,7 +1949,7 @@ char *get_lower_cpunamechar(void){
|
|||
|
||||
int get_coretype(void){
|
||||
|
||||
int family, exfamily, model, exmodel, vendor;
|
||||
int family, exfamily, model, exmodel, vendor, stepping;
|
||||
|
||||
if (!have_cpuid()) return CORE_80486;
|
||||
|
||||
|
|
@ -1845,6 +1957,7 @@ int get_coretype(void){
|
|||
exfamily = get_cputype(GET_EXFAMILY);
|
||||
model = get_cputype(GET_MODEL);
|
||||
exmodel = get_cputype(GET_EXMODEL);
|
||||
stepping = get_cputype(GET_STEPPING);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
|
|
@ -2002,19 +2115,7 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 10:
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
case 5:
|
||||
switch (model) {
|
||||
case 6:
|
||||
|
|
@ -2068,6 +2169,7 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
||||
case 6:
|
||||
if (model == 6)
|
||||
#ifndef NO_AVX512
|
||||
|
|
@ -2081,12 +2183,27 @@ int get_coretype(void){
|
|||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
#endif
|
||||
if (model == 10 || model == 12)
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
|
||||
case 7:
|
||||
if (model == 10)
|
||||
return CORE_NEHALEM;
|
||||
if (model == 14)
|
||||
if (model == 13 || model == 14) // Ice Lake
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
|
|
@ -2100,9 +2217,19 @@ int get_coretype(void){
|
|||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 9:
|
||||
|
||||
case 8:
|
||||
if (model == 14) { // Kaby Lake
|
||||
if (model == 12 || model == 13) { // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 14) { // Kaby Lake mobile
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
|
|
@ -2112,12 +2239,82 @@ int get_coretype(void){
|
|||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 15) { // Sapphire Rapids
  /* This function reports CORE_* identifiers (as every other branch here
     does); the CPUTYPE_* names are what get_cpuname() returns and must
     not be handed back from get_coretype(). */
  if (support_avx512_bf16())
    return CORE_COOPERLAKE;
  if (support_avx512())
    return CORE_SKYLAKEX;
  if (support_avx2())
    return CORE_HASWELL;
  if (support_avx())
    return CORE_SANDYBRIDGE;
  else
    return CORE_NEHALEM;
}
|
||||
break;
|
||||
|
||||
case 9:
|
||||
if (model == 7 || model == 10) { // Alder Lake
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 13) { // Ice Lake NNPI
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 14) { // Kaby Lake desktop
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
|
||||
case 10:
|
||||
switch (model) {
|
||||
case 5: // Comet Lake H and S
|
||||
case 6: // Comet Lake U
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
#endif
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
#endif
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
case 15:
|
||||
if (model <= 0x2) return CORE_NORTHWOOD;
|
||||
else return CORE_PRESCOTT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -2216,10 +2413,19 @@ int get_coretype(void){
|
|||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return CORE_NANO;
|
||||
break;
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return CORE_NANO;
|
||||
return CORE_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return CORE_NEHALEM;
|
||||
else
|
||||
return CORE_VIAC3;
|
||||
}
|
||||
return CORE_VIAC3;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
return CORE_UNKNOWN;
|
||||
|
|
@ -2302,6 +2508,7 @@ void get_cpuconfig(void){
|
|||
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
|
||||
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
|
||||
if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n");
|
||||
if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n");
|
||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||
|
|
@ -2373,9 +2580,11 @@ void get_sse(void){
|
|||
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
|
||||
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
|
||||
if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n");
|
||||
if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n");
|
||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
|
||||
|
||||
}
|
||||
//}
|
||||
|
|
|
|||
|
|
@ -27,57 +27,11 @@
|
|||
|
||||
#include <string.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
#include "cpuid_zarch.h"
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
{
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("Type", buffer, 4)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return CPU_Z13;
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
void get_libname(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,101 @@
|
|||
#include <stdlib.h>
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
// Guard the use of getauxval() on glibc version >= 2.16
#ifdef __GLIBC__
#include <features.h>
#if __GLIBC_PREREQ(2, 16)
#include <sys/auxv.h>
#define HAVE_GETAUXVAL 1
#else // __GLIBC_PREREQ(2, 16)
#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
#endif // __GLIBC_PREREQ(2, 16)
#else // __GLIBC__
#warning "Cannot detect SIMD support in Z13 or newer architectures since getauxval() from glibc is not available"
#endif // __GLIBC__

#ifdef HAVE_GETAUXVAL

/*
 * Return the hardware capability bits reported in the auxiliary vector,
 * masked by LD_HWCAP_MASK when that environment variable is set.
 */
static unsigned long get_hwcap(void)
{
	// note that a missing auxval is interpreted as no capabilities
	// available, which is safe.
	unsigned long hwcap = getauxval(AT_HWCAP);
	char *maskenv;

	// honor requests for not using specific CPU features in LD_HWCAP_MASK
	maskenv = getenv("LD_HWCAP_MASK");
	if (maskenv)
		hwcap &= strtoul(maskenv, NULL, 0);

	return hwcap;
}

#else // HAVE_GETAUXVAL

/*
 * Fallback used when getauxval() is not available (glibc older than 2.16,
 * or a C library that does not identify itself as glibc).
 */
static unsigned long get_hwcap(void) {
	// treat missing support for getauxval() as no capabilities available,
	// which is safe.
	return 0;
}

#endif // HAVE_GETAUXVAL

// The capability bits are only provided by <sys/auxv.h> on s390x; supply the
// values glibc uses so that callers still compile when the header was not
// included or does not define them.
#ifndef HWCAP_S390_VX
#define HWCAP_S390_VX 2048
#endif
#ifndef HWCAP_S390_VXE
#define HWCAP_S390_VXE 8192
#endif
|
||||
|
||||
static int detect(void)
|
||||
{
|
||||
unsigned long hwcap = get_hwcap();
|
||||
|
||||
// Choose the architecture level for optimized kernels based on hardware
|
||||
// capability bits (just like glibc chooses optimized implementations).
|
||||
//
|
||||
// The hardware capability bits that are used here indicate both
|
||||
// hardware support for a particular ISA extension and the presence of
|
||||
// software support to enable its use. For example, when HWCAP_S390_VX
|
||||
// is set then both the CPU can execute SIMD instructions and the Linux
|
||||
// kernel can manage applications using the vector registers and SIMD
|
||||
// instructions.
|
||||
//
|
||||
// See glibc's sysdeps/s390/dl-procinfo.h for an overview (also in
|
||||
// sysdeps/unix/sysv/linux/s390/bits/hwcap.h) of the defined hardware
|
||||
// capability bits. They are derived from the information that the
|
||||
// "store facility list (extended)" instructions provide.
|
||||
// (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD)
|
||||
//
|
||||
// currently used:
|
||||
// HWCAP_S390_VX - vector facility for z/Architecture (introduced with
|
||||
// IBM z13), enables level CPU_Z13 (SIMD)
|
||||
// HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM
|
||||
// z14), together with VX enables level CPU_Z14
|
||||
// (single-precision SIMD instructions)
|
||||
//
|
||||
// When you add optimized kernels that make use of other ISA extensions
|
||||
// (e.g., for exploiting the vector-enhancements facility 2 that was introduced
|
||||
// with IBM z15), then add a new architecture level (e.g., CPU_Z15) and gate
|
||||
// it on the hwcap that represents it here (e.g., HWCAP_S390_VXRS_EXT2
|
||||
// for the z15 vector enhancements).
|
||||
//
|
||||
// To learn the value of hwcaps on a given system, set the environment
|
||||
// variable LD_SHOW_AUXV and let ld.so dump it (e.g., by running
|
||||
// LD_SHOW_AUXV=1 /bin/true).
|
||||
// Also, the init function for dynamic arch support will print hwcaps
|
||||
// when OPENBLAS_VERBOSE is set to 2 or higher.
|
||||
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
|
||||
return CPU_Z14;
|
||||
|
||||
if (hwcap & HWCAP_S390_VX)
|
||||
return CPU_Z13;
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
12
ctest.c
12
ctest.c
|
|
@ -84,7 +84,7 @@ OS_AIX
|
|||
OS_OSF
|
||||
#endif
|
||||
|
||||
#if defined(__WIN32) || defined(__WIN64) || defined(__WINNT)
|
||||
#if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT)
|
||||
OS_WINNT
|
||||
#endif
|
||||
|
||||
|
|
@ -141,7 +141,7 @@ ARCH_SPARC
|
|||
ARCH_IA64
|
||||
#endif
|
||||
|
||||
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__)
|
||||
#if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__)
|
||||
BINARY_64
|
||||
#endif
|
||||
|
||||
|
|
@ -157,7 +157,15 @@ ARCH_ARM64
|
|||
ARCH_RISCV64
|
||||
#endif
|
||||
|
||||
#ifdef __loongarch64
|
||||
ARCH_LOONGARCH64
|
||||
#endif
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
|
||||
HAVE_C11
|
||||
#endif
|
||||
|
||||
#if defined(__e2k__)
|
||||
ARCH_E2K
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -4,10 +4,22 @@ include_directories(${PROJECT_BINARY_DIR})
|
|||
enable_language(Fortran)
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
|
||||
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
|
||||
"$ErrorActionPreference = \"Stop\"\n"
|
||||
"Get-Content $args[1] | & $args[0]\n"
|
||||
)
|
||||
set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1")
|
||||
else()
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh
|
||||
"$1 < $2\n"
|
||||
)
|
||||
set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh")
|
||||
endif()
|
||||
|
||||
foreach(float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char_upper)
|
||||
|
|
@ -21,7 +33,7 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
c_${float_char}blas1.c)
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat1"
|
||||
COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1")
|
||||
COMMAND $<TARGET_FILE:x${float_char}cblat1>)
|
||||
|
||||
#level2
|
||||
add_executable(x${float_char}cblat2
|
||||
|
|
@ -33,7 +45,7 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat2"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
|
||||
#level3
|
||||
add_executable(x${float_char}cblat3
|
||||
|
|
@ -45,6 +57,6 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
|
||||
endforeach()
|
||||
|
|
|
|||
|
|
@ -6,6 +6,9 @@ TOPDIR = ..
|
|||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
ifeq ($(F_COMPILER),GFORTRAN)
|
||||
override FFLAGS += -fno-tree-vectorize
|
||||
endif
|
||||
override TARGET_ARCH=
|
||||
override TARGET_MACH=
|
||||
|
||||
|
|
@ -212,6 +215,9 @@ ifeq ($(C_COMPILER), CLANG)
|
|||
CEXTRALIB = -lomp
|
||||
endif
|
||||
endif
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
CEXTRALIB = -lgomp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n,
|
|||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) );
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
|||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
|
|
@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
|
|||
*incx, beta, y, *incy );
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
|||
beta, y, *incy);
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
x, *incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
*incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA=*n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha,
|
|||
cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
|||
*incy, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX ));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX ));
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
|
@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
|
|||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
|
||||
for( i=0; i<*m; i++ ) {
|
||||
for( j=0; j<*n; j++ )
|
||||
|
|
@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
|
|
@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -411,8 +411,8 @@ void F77_dtpmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -451,8 +451,8 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -488,8 +488,8 @@ void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -540,8 +540,8 @@ void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
|
|||
|
|
@ -26,34 +26,34 @@ void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = (double *)malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = (double *)malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*k)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*k)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B = ( double* )malloc( LDB*(*n)*sizeof( double ) );
|
||||
B = ( double* )malloc( (size_t)LDB*(*n)*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -89,25 +89,25 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -143,20 +143,20 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*k)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -191,8 +191,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
B = ( double* )malloc( (*n)*LDB*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
B = ( double* )malloc( (*n)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
|
|
@ -202,8 +202,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
|
||||
B = ( double* )malloc( LDB*(*k)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
|
||||
B = ( double* )malloc( (size_t)LDB*(*k)*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
|
|
@ -211,7 +211,7 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -249,20 +249,20 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
|
@ -300,20 +300,20 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
|
|||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -43,7 +43,7 @@ void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
|
||||
for( i=0; i<*m; i++ ) {
|
||||
for( j=0; j<*n; j++ )
|
||||
|
|
@ -74,7 +74,7 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -102,7 +102,7 @@ void F77_strsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -123,7 +123,7 @@ void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -146,7 +146,7 @@ void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -170,7 +170,7 @@ void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
|
|
@ -196,7 +196,7 @@ void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*kl)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
|
|
@ -236,7 +236,7 @@ void F77_stbmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -282,7 +282,7 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -325,7 +325,7 @@ void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -369,8 +369,8 @@ void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -410,8 +410,8 @@ void F77_stpmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -449,8 +449,8 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -485,8 +485,8 @@ void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
@ -536,8 +536,8 @@ void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
|
|
|||
|
|
@ -23,34 +23,34 @@ void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = (float *)malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = (float *)malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*k)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*k)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B = ( float* )malloc( LDB*(*n)*sizeof( float ) );
|
||||
B = ( float* )malloc( (size_t)LDB*(*n)*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -85,25 +85,25 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -139,20 +139,20 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*k)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -187,8 +187,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
B = ( float* )malloc( (*n)*LDB*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
B = ( float* )malloc( (*n)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
|
|
@ -198,8 +198,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
|
||||
B = ( float* )malloc( LDB*(*k)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
|
||||
B = ( float* )malloc( (size_t)LDB*(*k)*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
|
|
@ -207,7 +207,7 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -245,20 +245,20 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
|
@ -296,20 +296,20 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ void F77_zgemv(int *order, char *transp, int *m, int *n,
|
|||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -50,7 +50,7 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
|||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
|
|
@ -94,7 +94,7 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -122,7 +122,7 @@ void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -154,7 +154,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
|
|||
*incx, beta, y, *incy );
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -251,8 +251,8 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
|||
beta, y, *incy);
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
AP = (CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -311,7 +311,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
x, *incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -375,7 +375,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
*incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
|
|
@ -436,8 +436,8 @@ void F77_ztpmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
cblas_ztpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -491,8 +491,8 @@ void F77_ztpsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
cblas_ztpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -544,7 +544,7 @@ void F77_ztrmv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA=*n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -573,7 +573,7 @@ void F77_ztrsv(int *order, char *uplow, char *transp, char *diagn,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
|
|
@ -601,8 +601,8 @@ void F77_zhpr(int *order, char *uplow, int *n, double *alpha,
|
|||
cblas_zhpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -678,8 +678,8 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
|||
*incy, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc( (((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
|
|
@ -750,7 +750,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
|
@ -784,7 +784,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
|||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
|
|
|||
|
|
@ -26,7 +26,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -35,7 +35,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
|||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -45,7 +45,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
|||
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
|
|
@ -54,7 +54,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
|||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
|
|
@ -63,7 +63,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
|||
}
|
||||
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
|
|
@ -103,7 +103,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -112,7 +112,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
|||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -120,14 +120,14 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
|||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
|
|
@ -167,25 +167,25 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
|
|
@ -221,7 +221,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -230,7 +230,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -238,7 +238,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
|
|
@ -277,7 +277,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -286,7 +286,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -294,7 +294,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
|
|
@ -333,8 +333,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -346,8 +346,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -357,7 +357,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
|
|
@ -397,8 +397,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -410,8 +410,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -421,7 +421,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
|||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
|
|
@ -463,7 +463,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -472,7 +472,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -480,7 +480,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
|
|
@ -522,7 +522,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -531,7 +531,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
|
|
@ -539,7 +539,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
|||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
#include "cblas_test.h"
|
||||
int CBLAS_CallFromC;
|
||||
int RowMajorStrg;
|
||||
|
||||
|
|
|
|||
|
|
@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type})
|
||||
endif ()
|
||||
|
||||
# special defines for complex
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
|
||||
foreach (u_source ${U_SOURCES})
|
||||
|
|
@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES})
|
|||
endif ()
|
||||
endforeach ()
|
||||
|
||||
if (BUILD_BFLOAT16)
|
||||
if (USE_THREAD)
|
||||
GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)
|
||||
if (USE_THREAD)
|
||||
GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE")
|
||||
|
|
|
|||
|
|
@ -64,9 +64,9 @@ CBLASOBJS += \
|
|||
chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \
|
||||
chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \
|
||||
chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \
|
||||
csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \
|
||||
cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \
|
||||
csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \
|
||||
csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) \
|
||||
cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \
|
||||
csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \
|
||||
ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \
|
||||
ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \
|
||||
ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \
|
||||
|
|
@ -92,6 +92,13 @@ CBLASOBJS += \
|
|||
ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \
|
||||
ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX)
|
||||
|
||||
ifndef NO_LAPACK
|
||||
CBLASOBJS += \
|
||||
cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \
|
||||
cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \
|
||||
csyr_U.$(SUFFIX) csyr_L.$(SUFFIX)
|
||||
endif
|
||||
|
||||
ZBLASOBJS += \
|
||||
zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \
|
||||
zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \
|
||||
|
|
|
|||
|
|
@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES})
|
|||
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0)
|
||||
endif ()
|
||||
if (BUILD_BFLOAT16)
|
||||
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
|
||||
if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
|
||||
GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
|
||||
endif ()
|
||||
endif ()
|
||||
endforeach ()
|
||||
|
||||
if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
|
|
|
|||
|
|
@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
|||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
|||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
|
|
|||
|
|
@ -333,14 +333,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#else
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
/*
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
*/
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -367,14 +367,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
/* Split local region of B into parts */
|
||||
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, js + div_n) - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
/*
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
*/
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
#endif
|
||||
/* Copy part of local region of B into workspace */
|
||||
|
|
|
|||
|
|
@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
||||
min_jj = ls - js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
||||
min_jj = js - ls - min_l - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
|
|
|
|||
|
|
@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 ""
|
|||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
list(APPEND COMMON_SOURCES dynamic_arm64.c)
|
||||
elseif (POWER)
|
||||
list(APPEND COMMON_SOURCES dynamic_power.c)
|
||||
else ()
|
||||
list(APPEND COMMON_SOURCES dynamic.c)
|
||||
endif ()
|
||||
|
|
|
|||
|
|
@ -24,10 +24,14 @@ else
|
|||
ifeq ($(ARCH),zarch)
|
||||
COMMONOBJS += dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),mips64)
|
||||
COMMONOBJS += dynamic_mips64.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
|
|
@ -92,10 +96,14 @@ else
|
|||
ifeq ($(ARCH),zarch)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),mips64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
double *, BLASLONG, void *) = func;
|
||||
double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG,
|
||||
double *, BLASLONG, double *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((double *)args -> alpha)[0],
|
||||
|
|
@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / Single */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *) = func;
|
||||
float *, BLASLONG, void *) = (void (*)
|
||||
(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((float *)args -> alpha)[0],
|
||||
|
|
@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / BFLOAT16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
bfloat16 *, BLASLONG, void *) = func;
|
||||
bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
bfloat16 *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((bfloat16 *)args -> alpha)[0],
|
||||
|
|
@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / BLAS_STOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
float *, BLASLONG, void *) = func;
|
||||
float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
float *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((float *)args -> alpha)[0],
|
||||
|
|
@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* REAL / BLAS_DTOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
double *, BLASLONG, void *) = func;
|
||||
double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
double *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((double *)args -> alpha)[0],
|
||||
|
|
@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* COMPLEX / Extended Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
|
||||
xdouble *, BLASLONG, xdouble *, BLASLONG,
|
||||
xdouble *, BLASLONG, void *) = func;
|
||||
xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
|
||||
xdouble *, BLASLONG, xdouble *, BLASLONG,
|
||||
xdouble *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((xdouble *)args -> alpha)[0],
|
||||
|
|
@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* COMPLEX / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
double *, BLASLONG, void *) = func;
|
||||
double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
double *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((double *)args -> alpha)[0],
|
||||
|
|
@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
/* COMPLEX / Single */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *) = func;
|
||||
float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
float *, BLASLONG, void *)) func;
|
||||
|
||||
afunc(args -> m, args -> n, args -> k,
|
||||
((float *)args -> alpha)[0],
|
||||
|
|
@ -425,7 +441,7 @@ blas_queue_t *tscq;
|
|||
#endif
|
||||
|
||||
if (queue) {
|
||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
|
||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine;
|
||||
|
||||
atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1);
|
||||
|
||||
|
|
@ -503,7 +519,7 @@ blas_queue_t *tscq;
|
|||
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
||||
} else
|
||||
if (queue -> mode & BLAS_PTHREAD) {
|
||||
void (*pthreadcompat)(void *) = queue -> routine;
|
||||
void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine;
|
||||
(pthreadcompat)(queue -> args);
|
||||
} else
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
||||
|
|
@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
fprintf(STDERR, "\n");
|
||||
#endif
|
||||
|
||||
routine = queue -> routine;
|
||||
routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine;
|
||||
|
||||
if (queue -> mode & BLAS_LEGACY) {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
||||
} else
|
||||
if (queue -> mode & BLAS_PTHREAD) {
|
||||
void (*pthreadcompat)(void *) = queue -> routine;
|
||||
void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine;
|
||||
(pthreadcompat)(queue -> args);
|
||||
} else
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n,
|
||||
|
|
@ -967,9 +983,11 @@ void goto_set_num_threads(int num_threads) {
|
|||
blas_cpu_number = num_threads;
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -1022,38 +1040,39 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
|
||||
int i;
|
||||
|
||||
if (!blas_server_avail) return 0;
|
||||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
if (blas_server_avail) {
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
|
||||
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_destory(&attr);
|
||||
pthread_attr_destroy(&attr);
|
||||
#endif
|
||||
|
||||
blas_server_avail = 0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
}
|
||||
UNLOCK_COMMAND(&server_lock);
|
||||
|
||||
return 0;
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@
|
|||
#include <stdlib.h>
|
||||
#include "common.h"
|
||||
|
||||
#if defined(OS_CYGWIN_NT) && !defined(unlikely)
|
||||
#if !defined(unlikely)
|
||||
#ifdef __GNUC__
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
#else
|
||||
|
|
@ -391,8 +391,9 @@ int blas_thread_init(void){
|
|||
|
||||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||
|
||||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||
#if defined(SMP_SERVER)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
// on Cygwin or as delayed init when a static library is used
|
||||
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
|||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_ZHAOXIN 5
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
|
@ -404,6 +405,7 @@ static int get_vendor(void){
|
|||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
|
@ -414,7 +416,7 @@ static int get_vendor(void){
|
|||
static gotoblas_t *get_coretype(void){
|
||||
|
||||
int eax, ebx, ecx, edx;
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
int family, exfamily, model, vendor, exmodel, stepping;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
|
|
@ -422,6 +424,7 @@ static gotoblas_t *get_coretype(void){
|
|||
exfamily = BITMASK(eax, 20, 0xff);
|
||||
model = BITMASK(eax, 4, 0x0f);
|
||||
exmodel = BITMASK(eax, 16, 0x0f);
|
||||
stepping = BITMASK(eax, 0, 0x0f);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
|
|
@ -621,11 +624,27 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 10 || model == 12){
|
||||
// Ice Lake SP
|
||||
if(support_avx512_bf16())
|
||||
return &gotoblas_COOPERLAKE;
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont Plus
|
||||
return &gotoblas_NEHALEM;
|
||||
if (model == 14) {
|
||||
if (model == 13 || model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
|
|
@ -642,8 +661,22 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 12 || model == 13) { // Tiger Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
|
|
@ -655,8 +688,42 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
case 10:
|
||||
if (model == 5 || model == 6) {
|
||||
if (model == 15){ // Sapphire Rapids
|
||||
if(support_avx512_bf16())
|
||||
return &gotoblas_COOPERLAKE;
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
|
||||
|
||||
case 9:
|
||||
if (model == 7 || model == 10) { // Alder Lake
|
||||
if(support_avx512_bf16())
|
||||
return &gotoblas_COOPERLAKE;
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
|
|
@ -666,7 +733,33 @@ static gotoblas_t *get_coretype(void){
|
|||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 10:
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 7) {
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
|
|
@ -779,10 +872,19 @@ static gotoblas_t *get_coretype(void){
|
|||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return &gotoblas_NANO;
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return &gotoblas_NANO;
|
||||
return &gotoblas_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -962,7 +1064,13 @@ void gotoblas_dynamic_init(void) {
|
|||
#ifdef ARCH_X86
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
|
||||
#else
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
|
||||
if (gotoblas == NULL) {
|
||||
if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE;
|
||||
else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX;
|
||||
else if (support_avx2()) gotoblas = &gotoblas_HASWELL;
|
||||
else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE;
|
||||
else gotoblas = &gotoblas_PRESCOTT;
|
||||
}
|
||||
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
|
||||
if (sizeof(void*) == 8) {
|
||||
if (gotoblas == &gotoblas_KATMAI ||
|
||||
|
|
|
|||
|
|
@ -43,6 +43,68 @@
|
|||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
#ifdef DYNAMIC_LIST
|
||||
#ifdef DYN_CORTEXA53
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
#else
|
||||
#define gotoblas_CORTEXA53 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA57
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
#else
|
||||
#define gotoblas_CORTEXA57 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA72
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
#else
|
||||
#define gotoblas_CORTEXA72 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA73
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
#else
|
||||
#define gotoblas_CORTEXA73 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_FALKOR
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
#else
|
||||
#define gotoblas_FALKOR gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_TSV110
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
#else
|
||||
#define gotoblas_TSV110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
#else
|
||||
#define gotoblas_THUNDERX gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX2T99
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
#else
|
||||
#define gotoblas_THUNDERX2T99 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX3T110
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
#else
|
||||
#define gotoblas_THUNDERX3T110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_EMAG8180
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
#else
|
||||
#define gotoblas_EMAG8180 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_NEOVERSEN1
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEX_A55
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#else
|
||||
#define gotoblas_CORTEXA55 gotoblas_ARMV8
|
||||
#endif
|
||||
#else
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
|
|
@ -54,10 +116,12 @@ extern gotoblas_t gotoblas_TSV110;
|
|||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 12
|
||||
#define NUM_CORETYPES 13
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
|
@ -68,7 +132,7 @@ extern void openblas_warning(int verbose, const char * msg);
|
|||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
__asm__("mrs %0, "#id : "=r" (var)); \
|
||||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
|
|
@ -83,7 +147,10 @@ static char *corename[] = {
|
|||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"neoversev1",
|
||||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
|
|
@ -100,6 +167,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[12];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
|
@ -131,6 +199,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_THUNDERX3T110);
|
||||
case 12: return (&gotoblas_CORTEXA55);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
|
@ -189,6 +258,8 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_CORTEXA73;
|
||||
case 0xd0c: // Neoverse N1
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
case 0xd05: // Cortex A55
|
||||
return &gotoblas_CORTEXA55;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
|
|
|
|||
|
|
@ -0,0 +1,230 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#include <sys/wait.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/resource.h>
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_LOONGSON3R3;
|
||||
extern gotoblas_t gotoblas_LOONGSON3R4;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 2
|
||||
|
||||
static char *corename[] = {
|
||||
"loongson3r3",
|
||||
"loongson3r4",
|
||||
"UNKNOWN"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0];
|
||||
if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype) {
|
||||
int i;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i=0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_LOONGSON3R3);
|
||||
case 1: return (&gotoblas_LOONGSON3R4);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define MMI_MASK 0x00000010
|
||||
#define MSA_MASK 0x00000020
|
||||
|
||||
int fd[2];
|
||||
int support_cpucfg;
|
||||
|
||||
static void handler(int signum)
|
||||
{
|
||||
close(fd[1]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Brief : Function to check if cpucfg supported on loongson
|
||||
* Return: 1 supported
|
||||
* 0 not supported
|
||||
*/
|
||||
static int cpucfg_test(void) {
|
||||
pid_t pid;
|
||||
int status = 0;
|
||||
|
||||
support_cpucfg = 0;
|
||||
pipe(fd);
|
||||
pid = fork();
|
||||
if (pid == 0) { /* Subprocess */
|
||||
struct sigaction act;
|
||||
close(fd[0]);
|
||||
/* Set signal action for SIGILL. */
|
||||
act.sa_handler = handler;
|
||||
sigaction(SIGILL,&act,NULL);
|
||||
|
||||
/* Execute cpucfg in subprocess. */
|
||||
__asm__ volatile(
|
||||
".insn \n\t"
|
||||
".word (0xc8080118) \n\t"
|
||||
:::
|
||||
);
|
||||
support_cpucfg = 1;
|
||||
write(fd[1],&support_cpucfg,sizeof(support_cpucfg));
|
||||
close(fd[1]);
|
||||
exit(0);
|
||||
} else if (pid > 0){ /* Parent process*/
|
||||
close(fd[1]);
|
||||
if ((waitpid(pid,&status,0) <= 0) ||
|
||||
(read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0))
|
||||
support_cpucfg = 0;
|
||||
close(fd[0]);
|
||||
} else {
|
||||
support_cpucfg = 0;
|
||||
}
|
||||
|
||||
return support_cpucfg;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype_from_cpucfg(void) {
|
||||
int flag = 0;
|
||||
__asm__ volatile(
|
||||
".insn \n\t"
|
||||
"dli $8, 0x01 \n\t"
|
||||
".word (0xc9084918) \n\t"
|
||||
"usw $9, 0x00(%0) \n\t"
|
||||
:
|
||||
: "r"(&flag)
|
||||
: "memory"
|
||||
);
|
||||
if (flag & MSA_MASK)
|
||||
return (&gotoblas_LOONGSON3R4);
|
||||
if (flag & MMI_MASK)
|
||||
return (&gotoblas_LOONGSON3R3);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype_from_cpuinfo(void) {
|
||||
#ifdef linux
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000"))
|
||||
return (&gotoblas_LOONGSON3R3);
|
||||
else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000"))
|
||||
return (&gotoblas_LOONGSON3R4);
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Detect the running core: use the cpucfg-based probe when cpucfg_test()
 * reports 1, otherwise fall back to parsing /proc/cpuinfo.  May return
 * NULL when neither method recognises the CPU. */
static gotoblas_t *get_coretype(void) {
  if (cpucfg_test() == 1)
    return get_coretype_from_cpucfg();

  return get_coretype_from_cpuinfo();
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to loongson3r3 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_LOONGSON3R3;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
|
|
@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8;
|
|||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
//#if (!defined __GNUC__) || ( __GNUC__ >= 11) \
|
||||
// || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
//#define HAVE_P10_SUPPORT 1
|
||||
//#endif
|
||||
#ifdef HAVE_P10_SUPPORT
|
||||
extern gotoblas_t gotoblas_POWER10;
|
||||
#endif
|
||||
|
|
@ -27,7 +23,9 @@ static char *corename[] = {
|
|||
#define NUM_CORETYPES 4
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
#ifndef C_PGI
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
|
|
@ -38,10 +36,164 @@ char *gotoblas_corename(void) {
|
|||
return corename[0];
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
/* Stand-in for __builtin_cpu_supports(), compiled only when __clang__ is
   defined.  It ignores arg and always returns 0, i.e. every queried feature
   is reported as unsupported. */
static int __builtin_cpu_supports(char* arg)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(C_PGI) || defined(__clang__)
|
||||
/*
|
||||
* NV HPC compilers do not yet implement __builtin_cpu_is().
|
||||
* Fake a version here for use in the CPU detection code below.
|
||||
*
|
||||
* Strategy here is to first check the CPU to see what it actually is,
|
||||
* and then test the input to see if what the CPU actually is matches
|
||||
* what was requested.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Define POWER processor version table.
|
||||
*
|
||||
* NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
|
||||
*/
|
||||
|
||||
/* CPU generation codes stored in the cpu_type column of pvrPOWER[]. */
#define CPU_UNKNOWN 0
#define CPU_POWER5 5
#define CPU_POWER6 6
#define CPU_POWER8 8
#define CPU_POWER9 9
#define CPU_POWER10 10

/*
 * Processor Version Register lookup table.  An entry matches a PVR reading
 * when (pvr & pvr_mask) == pvr_value.  Entries are listed in lookup order and
 * the final all-zero entry matches every value, so it must stay last.
 */
static struct {
  uint32_t pvr_mask;
  uint32_t pvr_value;
  const char* cpu_name;
  uint32_t cpu_type;
} pvrPOWER [] = {
  /* pvr_mask,  pvr_value,  cpu_name,          cpu_type */
  { 0xffffffff, 0x0f000001, "POWER5+",         CPU_POWER5  }, /* POWER6 in P5+ mode; 2.04-compliant processor */
  { 0xffff0000, 0x003e0000, "POWER6 (raw)",    CPU_POWER6  }, /* Power6 aka POWER6X */
  { 0xffff0000, 0x003f0000, "POWER7 (raw)",    CPU_POWER6  }, /* Power7 */
  { 0xffff0000, 0x004A0000, "POWER7+ (raw)",   CPU_POWER6  }, /* Power7+ */
  { 0xffff0000, 0x004b0000, "POWER8E (raw)",   CPU_POWER8  }, /* Power8E */
  { 0xffff0000, 0x004c0000, "POWER8NVL (raw)", CPU_POWER8  }, /* Power8NVL */
  { 0xffff0000, 0x004d0000, "POWER8 (raw)",    CPU_POWER8  }, /* Power8 */
  { 0xffffefff, 0x004e0200, "POWER9 (raw)",    CPU_POWER9  }, /* Power9 DD2.0 */
  { 0xffffefff, 0x004e0201, "POWER9 (raw)",    CPU_POWER9  }, /* Power9 DD 2.1 */
  { 0xffff0000, 0x004e0000, "POWER9 (raw)",    CPU_POWER9  }, /* Power9 DD2.2 or later */
  { 0xffff0000, 0x00800000, "POWER10 (raw)",   CPU_POWER10 }, /* Power10 */
  { 0x0,        0x0,        "Unknown",         CPU_UNKNOWN }, /* End of table, pvr_mask and pvr_value must be zero */
};
|
||||
|
||||
static int __builtin_cpu_is(const char *cpu) {
|
||||
int i;
|
||||
uint32_t pvr;
|
||||
uint32_t cpu_type;
|
||||
|
||||
asm("mfpvr %0" : "=r"(pvr));
|
||||
|
||||
for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
|
||||
if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(DEBUG)
|
||||
printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
|
||||
pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
|
||||
#endif
|
||||
cpu_type = pvrPOWER[i].cpu_type;
|
||||
|
||||
if (!strcmp(cpu, "power8"))
|
||||
return cpu_type == CPU_POWER8;
|
||||
if (!strcmp(cpu, "power9"))
|
||||
return cpu_type == CPU_POWER9;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* C_PGI || __clang__ */
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
|
||||
#ifndef C_PGI
|
||||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
|
||||
return &gotoblas_POWER6;
|
||||
#endif
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
|
|
@ -52,6 +204,11 @@ static gotoblas_t *get_coretype(void) {
|
|||
if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
|
||||
return &gotoblas_POWER10;
|
||||
#endif
|
||||
/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
if (__builtin_cpu_is("power10"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -72,7 +229,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
|||
|
||||
switch (found)
|
||||
{
|
||||
#ifndef C_PGI
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
#endif
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
|
|
|
|||
|
|
@ -1,38 +1,7 @@
|
|||
#include "common.h"
|
||||
#include "cpuid_zarch.h"
|
||||
#include <stdbool.h>
|
||||
|
||||
// Guard the use of getauxval() on glibc version >= 2.16
|
||||
#ifdef __GLIBC__
|
||||
#include <features.h>
|
||||
#if __GLIBC_PREREQ(2, 16)
|
||||
#include <sys/auxv.h>
|
||||
#define HAVE_GETAUXVAL 1
|
||||
|
||||
static unsigned long get_hwcap(void)
|
||||
{
|
||||
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||
char *maskenv;
|
||||
|
||||
// honor requests for not using specific CPU features in LD_HWCAP_MASK
|
||||
maskenv = getenv("LD_HWCAP_MASK");
|
||||
if (maskenv)
|
||||
hwcap &= strtoul(maskenv, NULL, 0);
|
||||
|
||||
return hwcap;
|
||||
// note that a missing auxval is interpreted as no capabilities
|
||||
// available, which is safe.
|
||||
}
|
||||
|
||||
#else // __GLIBC_PREREQ(2, 16)
|
||||
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
|
||||
|
||||
static unsigned long get_hwcap(void) {
|
||||
// treat missing support for getauxval() as no capabilities available,
|
||||
// which is safe.
|
||||
return 0;
|
||||
}
|
||||
#endif // __GLIBC_PREREQ(2, 16)
|
||||
#endif // __GLIBC
|
||||
|
||||
extern gotoblas_t gotoblas_ZARCH_GENERIC;
|
||||
#ifdef DYN_Z13
|
||||
|
|
@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14;
|
|||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
extern int openblas_verbose();
|
||||
extern void openblas_warning(int verbose, const char* msg);
|
||||
|
||||
static char* corename[] = {
|
||||
"unknown",
|
||||
"Z13",
|
||||
"Z14",
|
||||
"ZARCH_GENERIC",
|
||||
};
|
||||
|
||||
char* gotoblas_corename(void) {
|
||||
#ifdef DYN_Z13
|
||||
if (gotoblas == &gotoblas_Z13) return corename[1];
|
||||
if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13];
|
||||
#endif
|
||||
#ifdef DYN_Z14
|
||||
if (gotoblas == &gotoblas_Z14) return corename[2];
|
||||
if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
|
||||
if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC];
|
||||
|
||||
return corename[0];
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
#ifndef HWCAP_S390_VXE
|
||||
|
|
@ -79,25 +42,28 @@ char* gotoblas_corename(void) {
|
|||
*/
|
||||
static gotoblas_t* get_coretype(void) {
|
||||
|
||||
unsigned long hwcap __attribute__((unused)) = get_hwcap();
|
||||
int cpu = detect();
|
||||
|
||||
#ifdef DYN_Z14
|
||||
switch(cpu) {
|
||||
// z14 and z15 systems: exploit Vector Facility (SIMD) and
|
||||
// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
|
||||
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
|
||||
case CPU_Z14:
|
||||
#ifdef DYN_Z14
|
||||
return &gotoblas_Z14;
|
||||
#endif
|
||||
|
||||
#ifdef DYN_Z13
|
||||
// z13: Vector Facility (SIMD for double)
|
||||
if (hwcap & HWCAP_S390_VX)
|
||||
case CPU_Z13:
|
||||
#ifdef DYN_Z13
|
||||
return &gotoblas_Z13;
|
||||
#endif
|
||||
|
||||
default:
|
||||
// fallback in case of missing compiler support, systems before z13, or
|
||||
// when the OS does not advertise support for the Vector Facility (e.g.,
|
||||
// missing support in the OS kernel)
|
||||
return &gotoblas_ZARCH_GENERIC;
|
||||
return &gotoblas_ZARCH_GENERIC;
|
||||
}
|
||||
}
|
||||
|
||||
static gotoblas_t* force_coretype(char* coretype) {
|
||||
|
|
@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) {
|
|||
|
||||
for (i = 0; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
if (!strncasecmp(coretype, cpuname[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (found == 1) {
|
||||
if (found == CPU_Z13) {
|
||||
#ifdef DYN_Z13
|
||||
return &gotoblas_Z13;
|
||||
#else
|
||||
openblas_warning(1, "Z13 support not compiled in");
|
||||
return NULL;
|
||||
#endif
|
||||
} else if (found == 2) {
|
||||
} else if (found == CPU_Z14) {
|
||||
#ifdef DYN_Z14
|
||||
return &gotoblas_Z14;
|
||||
#else
|
||||
openblas_warning(1, "Z14 support not compiled in");
|
||||
return NULL;
|
||||
#endif
|
||||
} else if (found == 3) {
|
||||
} else if (found == CPU_GENERIC) {
|
||||
return &gotoblas_ZARCH_GENERIC;
|
||||
}
|
||||
|
||||
|
|
@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) {
|
|||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
if (openblas_verbose() >= 2) {
|
||||
snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n",
|
||||
getauxval(AT_HWCAP));
|
||||
openblas_warning(2, coremsg);
|
||||
}
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
|
|
@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) {
|
|||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
if (openblas_verbose() >= 2) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
}
|
||||
gotoblas->init();
|
||||
}
|
||||
else {
|
||||
|
|
|
|||
|
|
@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef likely
|
||||
#ifdef __GNUC__
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
#else
|
||||
#define likely(x) (x)
|
||||
#define unlikely(x) (x)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(USE_TLS) && defined(SMP)
|
||||
#define COMPILE_TLS
|
||||
|
||||
|
|
@ -222,11 +232,11 @@ int get_num_procs(void);
|
|||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
int ret;
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
|
|
@ -236,6 +246,15 @@ int get_num_procs(void) {
|
|||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
#if _OPENMP >= 201511
|
||||
ret = omp_get_num_places();
|
||||
if (ret >0 ) nums = ret;
|
||||
#endif
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
#endif
|
||||
|
|
@ -428,7 +447,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
|
@ -436,7 +455,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
|
|
@ -460,7 +479,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
|
@ -1241,7 +1260,7 @@ UNLOCK_COMMAND(&alloc_lock);
|
|||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
|
|
@ -1291,7 +1310,12 @@ UNLOCK_COMMAND(&alloc_lock);
|
|||
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
|
||||
|
||||
error:
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
|
@ -1619,10 +1643,12 @@ static int on_process_term(void)
|
|||
#else
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
#endif
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
|
||||
|
|
@ -1631,10 +1657,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI
|
|||
#else
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
#endif
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const int(*p_process_term)(void) = on_process_term;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
#endif
|
||||
|
|
@ -1668,16 +1696,23 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#ifndef MEM_LARGE_PAGES
|
||||
#define MEM_LARGE_PAGES 0x20000000
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#define ALLOC_MMAP
|
||||
#define ALLOC_MALLOC
|
||||
#else
|
||||
#define ALLOC_MALLOC
|
||||
|
||||
inline int puts(const char *str) { return 0; }
|
||||
inline int printf(const char *format, ...) { return 0; }
|
||||
inline char *getenv(const char *name) { return ""; }
|
||||
inline int atoi(const char *str) { return 0; }
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
||||
#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
|
|
@ -1691,7 +1726,6 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
|
|
@ -1767,11 +1801,12 @@ int get_num_procs(void);
|
|||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
|
|
@ -1781,10 +1816,20 @@ int get_num_procs(void) {
|
|||
#endif
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
/* if (omp_get_proc_bind() != omp_proc_bind_false) */
|
||||
#if _OPENMP >= 201511
|
||||
ret = omp_get_num_places();
|
||||
if (ret >0 ) nums = ret;
|
||||
#endif
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
#endif
|
||||
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
#else
|
||||
|
|
@ -1969,7 +2014,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
|
@ -1977,7 +2022,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
|
|
@ -2001,7 +2046,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
|
@ -2045,6 +2090,7 @@ struct release_t {
|
|||
int hugetlb_allocated = 0;
|
||||
|
||||
static struct release_t release_info[NUM_BUFFERS];
|
||||
static struct release_t *new_release_info;
|
||||
static int release_pos = 0;
|
||||
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
|
|
@ -2095,8 +2141,13 @@ static void *alloc_mmap(void *address){
|
|||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
|
||||
}
|
||||
release_pos ++;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
|
@ -2259,8 +2310,13 @@ static void *alloc_mmap(void *address){
|
|||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free;
|
||||
}
|
||||
release_pos ++;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
|
@ -2292,8 +2348,13 @@ static void *alloc_malloc(void *address){
|
|||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_malloc_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2326,8 +2387,13 @@ static void *alloc_qalloc(void *address){
|
|||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_qalloc_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2355,8 +2421,13 @@ static void *alloc_windows(void *address){
|
|||
if (map_address == (void *)NULL) map_address = (void *)-1;
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_windows_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2399,9 +2470,15 @@ static void *alloc_devicedirver(void *address){
|
|||
fd, 0);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = fd;
|
||||
release_info[release_pos].func = alloc_devicedirver_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].attr = fd;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2435,9 +2512,15 @@ static void *alloc_shm(void *address){
|
|||
|
||||
shmctl(shmid, IPC_RMID, 0);
|
||||
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = shmid;
|
||||
release_info[release_pos].func = alloc_shm_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].attr = shmid;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2541,8 +2624,13 @@ static void *alloc_hugetlb(void *address){
|
|||
#endif
|
||||
|
||||
if (map_address != (void *)-1){
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_hugetlb_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2589,9 +2677,15 @@ static void *alloc_hugetlbfile(void *address){
|
|||
fd, 0);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
if (likely(release_pos < NUM_BUFFERS)) {
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].attr = fd;
|
||||
release_info[release_pos].func = alloc_hugetlbfile_free;
|
||||
} else {
|
||||
new_release_info[release_pos-NUM_BUFFERS].address = map_address;
|
||||
new_release_info[release_pos-NUM_BUFFERS].attr = fd;
|
||||
new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free;
|
||||
}
|
||||
release_pos ++;
|
||||
}
|
||||
|
||||
|
|
@ -2621,8 +2715,25 @@ static volatile struct {
|
|||
|
||||
} memory[NUM_BUFFERS];
|
||||
|
||||
static int memory_initialized = 0;
|
||||
struct newmemstruct
|
||||
{
|
||||
BLASULONG lock;
|
||||
void *addr;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int pos;
|
||||
#endif
|
||||
int used;
|
||||
#ifndef __64BIT__
|
||||
char dummy[48];
|
||||
#else
|
||||
char dummy[40];
|
||||
#endif
|
||||
|
||||
};
|
||||
static volatile struct newmemstruct *newmemory;
|
||||
|
||||
static int memory_initialized = 0;
|
||||
static int memory_overflowed = 0;
|
||||
/* Memory allocation routine */
|
||||
/* procpos ... indicates where it comes from */
|
||||
/* 0 : Level 3 functions */
|
||||
|
|
@ -2631,6 +2742,8 @@ static int memory_initialized = 0;
|
|||
|
||||
void *blas_memory_alloc(int procpos){
|
||||
|
||||
int i;
|
||||
|
||||
int position;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
int mypos = 0;
|
||||
|
|
@ -2761,6 +2874,25 @@ void *blas_memory_alloc(int procpos){
|
|||
position ++;
|
||||
|
||||
} while (position < NUM_BUFFERS);
|
||||
|
||||
if (memory_overflowed) {
|
||||
|
||||
do {
|
||||
RMB;
|
||||
#if defined(USE_OPENMP)
|
||||
if (!newmemory[position-NUM_BUFFERS].used) {
|
||||
blas_lock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
#endif
|
||||
if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
}
|
||||
#endif
|
||||
position ++;
|
||||
|
||||
} while (position < 512+NUM_BUFFERS);
|
||||
}
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
|
@ -2788,7 +2920,7 @@ void *blas_memory_alloc(int procpos){
|
|||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
|
|
@ -2868,8 +3000,102 @@ void *blas_memory_alloc(int procpos){
|
|||
return (void *)memory[position].addr;
|
||||
|
||||
error:
|
||||
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
if (memory_overflowed) goto terminate;
|
||||
fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
|
||||
memory_overflowed=1;
|
||||
new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
|
||||
newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
|
||||
for (i = 0; i < 512; i++) {
|
||||
newmemory[i].addr = (void *)0;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
newmemory[i].pos = -1;
|
||||
#endif
|
||||
newmemory[i].used = 0;
|
||||
newmemory[i].lock = 0;
|
||||
}
|
||||
|
||||
allocation2:
|
||||
newmemory[position-NUM_BUFFERS].used = 1;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#else
|
||||
blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
|
||||
#endif
|
||||
do {
|
||||
#ifdef DEBUG
|
||||
printf("Allocation Start : %lx\n", base_address);
|
||||
#endif
|
||||
|
||||
map_address = (void *)-1;
|
||||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
#ifdef ALLOC_DEVICEDRIVER
|
||||
if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) {
|
||||
fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef ALLOC_HUGETLBFILE
|
||||
if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
|
||||
#ifndef OS_WINDOWS
|
||||
fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)
|
||||
if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
|
||||
#endif
|
||||
|
||||
func ++;
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Success -> %08lx\n", map_address);
|
||||
#endif
|
||||
if (((BLASLONG) map_address) == -1) base_address = 0UL;
|
||||
|
||||
if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
|
||||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
newmemory[position-NUM_BUFFERS].addr = map_address;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position);
|
||||
#endif
|
||||
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
|
||||
if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos;
|
||||
|
||||
#endif
|
||||
return (void *)newmemory[position-NUM_BUFFERS].addr;
|
||||
|
||||
terminate:
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
@ -2888,13 +3114,28 @@ void blas_memory_free(void *free_area){
|
|||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
if (position >= NUM_BUFFERS) goto error;
|
||||
if (position >= NUM_BUFFERS && !memory_overflowed) goto error;
|
||||
|
||||
#ifdef DEBUG
|
||||
if (memory[position].addr != free_area) goto error;
|
||||
printf(" Position : %d\n", position);
|
||||
#endif
|
||||
if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) {
|
||||
while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area))
|
||||
position++;
|
||||
// arm: ensure all writes are finished before other thread takes this memory
|
||||
WMB;
|
||||
|
||||
newmemory[position].used = 0;
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap from overflow area succeeded.\n\n");
|
||||
#endif
|
||||
return;
|
||||
} else {
|
||||
// arm: ensure all writes are finished before other thread takes this memory
|
||||
WMB;
|
||||
|
||||
|
|
@ -2908,7 +3149,7 @@ void blas_memory_free(void *free_area){
|
|||
#endif
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
error:
|
||||
printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area);
|
||||
|
||||
|
|
@ -2943,7 +3184,10 @@ void blas_shutdown(void){
|
|||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
for (pos = 0; pos < release_pos; pos ++) {
|
||||
if (likely(pos < NUM_BUFFERS))
|
||||
release_info[pos].func(&release_info[pos]);
|
||||
else
|
||||
new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]);
|
||||
}
|
||||
|
||||
#ifdef SEEK_ADDRESS
|
||||
|
|
@ -2960,6 +3204,15 @@ void blas_shutdown(void){
|
|||
#endif
|
||||
memory[pos].lock = 0;
|
||||
}
|
||||
if (memory_overflowed)
|
||||
for (pos = 0; pos < 512; pos ++){
|
||||
newmemory[pos].addr = (void *)0;
|
||||
newmemory[pos].used = 0;
|
||||
#if defined(WHEREAMI) && !defined(USE_OPENMP)
|
||||
newmemory[pos].pos = -1;
|
||||
#endif
|
||||
newmemory[pos].lock = 0;
|
||||
}
|
||||
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
|
|
|
|||
|
|
@ -183,7 +183,7 @@ int get_L2_size(void){
|
|||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
|
||||
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
|
|
@ -269,7 +269,7 @@ void blas_set_parameter(void){
|
|||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \
|
||||
defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \
|
||||
defined(SKYLAKEX) || defined(COOPERLAKE)
|
||||
defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
|
@ -524,6 +524,9 @@ void blas_set_parameter(void){
|
|||
xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
|
||||
#endif
|
||||
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
||||
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
||||
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
||||
|
|
@ -629,7 +632,9 @@ void blas_set_parameter(void){
|
|||
xgemm_p = 16 * (size + 1);
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_BFLOAT16
|
||||
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
|
||||
#endif
|
||||
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
||||
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
||||
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
||||
|
|
@ -717,7 +722,7 @@ void blas_set_parameter(void){
|
|||
|
||||
#if defined(ARCH_MIPS64)
|
||||
void blas_set_parameter(void){
|
||||
#if defined(LOONGSON3A)
|
||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
|
||||
#ifdef SMP
|
||||
if(blas_num_threads == 1){
|
||||
#endif
|
||||
|
|
@ -731,20 +736,6 @@ void blas_set_parameter(void){
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3B)
|
||||
#ifdef SMP
|
||||
if(blas_num_threads == 1 || blas_num_threads == 2){
|
||||
#endif
|
||||
//single thread
|
||||
dgemm_r = 640;
|
||||
#ifdef SMP
|
||||
}else{
|
||||
//multi thread
|
||||
dgemm_r = 160;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -139,9 +139,17 @@ endif
|
|||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
ifeq ($(F_COMPILER), INTEL)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def
|
||||
else
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Changelog
|
||||
# 2017/09/03 staticfloat
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue