Merge branch 'develop' into risc-v
This commit is contained in:
commit
4aa2d89217
|
@ -0,0 +1,143 @@
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_gcc_make
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:19.04
|
||||||
|
environment:
|
||||||
|
CC: gcc
|
||||||
|
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||||
|
commands:
|
||||||
|
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC gfortran perl
|
||||||
|
- $CC --version
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||||
|
- make -C test $COMMON_FLAGS
|
||||||
|
- make -C ctest $COMMON_FLAGS
|
||||||
|
- make -C utest $COMMON_FLAGS
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm32_gcc_make
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:19.04
|
||||||
|
environment:
|
||||||
|
CC: gcc
|
||||||
|
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
|
||||||
|
commands:
|
||||||
|
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC gfortran perl
|
||||||
|
- $CC --version
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||||
|
- make -C test $COMMON_FLAGS
|
||||||
|
- make -C ctest $COMMON_FLAGS
|
||||||
|
- make -C utest $COMMON_FLAGS
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_clang_make
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: clang
|
||||||
|
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
|
||||||
|
commands:
|
||||||
|
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC gfortran perl
|
||||||
|
- $CC --version
|
||||||
|
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||||
|
- make -C test $COMMON_FLAGS
|
||||||
|
- make -C ctest $COMMON_FLAGS
|
||||||
|
- make -C utest $COMMON_FLAGS
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm32_clang_cmake
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: clang
|
||||||
|
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||||
|
commands:
|
||||||
|
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC g++ perl cmake
|
||||||
|
- $CC --version
|
||||||
|
- mkdir build && cd build
|
||||||
|
- cmake $CMAKE_FLAGS ..
|
||||||
|
- make -j
|
||||||
|
- ctest -V
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_gcc_cmake
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: gcc
|
||||||
|
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||||
|
commands:
|
||||||
|
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC g++ perl cmake
|
||||||
|
- $CC --version
|
||||||
|
- mkdir build && cd build
|
||||||
|
- cmake $CMAKE_FLAGS ..
|
||||||
|
- make -j
|
||||||
|
- ctest -V
|
||||||
|
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
name: arm64_clang_cmake
|
||||||
|
|
||||||
|
platform:
|
||||||
|
os: linux
|
||||||
|
arch: arm64
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Build and Test
|
||||||
|
image: ubuntu:18.04
|
||||||
|
environment:
|
||||||
|
CC: clang
|
||||||
|
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
|
||||||
|
commands:
|
||||||
|
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
|
||||||
|
- apt-get update -y
|
||||||
|
- apt-get install -y make $CC g++ perl cmake
|
||||||
|
- $CC --version
|
||||||
|
- mkdir build && cd build
|
||||||
|
- cmake $CMAKE_FLAGS ..
|
||||||
|
- make -j
|
||||||
|
- ctest -V
|
|
@ -0,0 +1,78 @@
|
||||||
|
# Only the "head" branch of the OpenBLAS package is tested
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- '**/nightly-Homebrew-build.yml'
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- develop
|
||||||
|
paths:
|
||||||
|
- '**/nightly-Homebrew-build.yml'
|
||||||
|
schedule:
|
||||||
|
- cron: 45 7 * * *
|
||||||
|
# This is 7:45 AM UTC daily, late at night in the USA
|
||||||
|
|
||||||
|
# Since push and pull_request will still always be building and testing the `develop` branch,
|
||||||
|
# it only makes sense to test if this file has been changed
|
||||||
|
|
||||||
|
name: Nightly-Homebrew-Build
|
||||||
|
jobs:
|
||||||
|
build-OpenBLAS-with-Homebrew:
|
||||||
|
runs-on: macos-latest
|
||||||
|
env:
|
||||||
|
HOMEBREW_DEVELOPER: "ON"
|
||||||
|
HOMEBREW_DISPLAY_INSTALL_TIMES: "ON"
|
||||||
|
HOMEBREW_NO_ANALYTICS: "ON"
|
||||||
|
HOMEBREW_NO_AUTO_UPDATE: "ON"
|
||||||
|
HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON"
|
||||||
|
HOMEBREW_NO_INSTALL_CLEANUP: "ON"
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Random delay for cron job
|
||||||
|
run: |
|
||||||
|
delay=$(( RANDOM % 600 ))
|
||||||
|
printf 'Delaying for %s seconds on event %s' ${delay} "${{ github.event_name }}"
|
||||||
|
sleep ${delay}
|
||||||
|
if: github.event_name == 'schedule'
|
||||||
|
|
||||||
|
- uses: actions/checkout@v2
|
||||||
|
# This isn't even needed, technically. Homebrew will get `develop` via git
|
||||||
|
|
||||||
|
- name: Update Homebrew
|
||||||
|
if: github.event_name != 'pull_request'
|
||||||
|
run: brew update || true
|
||||||
|
|
||||||
|
- name: Install prerequisites
|
||||||
|
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
|
||||||
|
|
||||||
|
- name: Install and bottle OpenBLAS
|
||||||
|
run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas
|
||||||
|
# the HEAD flags tell Homebrew to build the develop branch, fetched via git
|
||||||
|
|
||||||
|
- name: Create bottle
|
||||||
|
run: |
|
||||||
|
brew bottle -v openblas
|
||||||
|
mkdir bottles
|
||||||
|
mv *.bottle.tar.gz bottles
|
||||||
|
|
||||||
|
- name: Upload bottle
|
||||||
|
uses: actions/upload-artifact@v1
|
||||||
|
with:
|
||||||
|
name: openblas--HEAD.catalina.bottle.tar.gz
|
||||||
|
path: bottles
|
||||||
|
|
||||||
|
- name: Show linkage
|
||||||
|
run: brew linkage -v openblas
|
||||||
|
|
||||||
|
- name: Test openblas
|
||||||
|
run: brew test --HEAD --verbose openblas
|
||||||
|
|
||||||
|
- name: Audit openblas formula
|
||||||
|
run: |
|
||||||
|
brew audit --strict openblas
|
||||||
|
brew cat openblas
|
||||||
|
|
||||||
|
- name: Post logs on failure
|
||||||
|
if: failure()
|
||||||
|
run: brew gist-logs --with-hostname -v openblas
|
|
@ -87,4 +87,5 @@ build.*
|
||||||
*.swp
|
*.swp
|
||||||
benchmark/*.goto
|
benchmark/*.goto
|
||||||
benchmark/smallscaling
|
benchmark/smallscaling
|
||||||
|
CMakeCache.txt
|
||||||
|
CMakeFiles/*
|
||||||
|
|
42
.travis.yml
42
.travis.yml
|
@ -4,11 +4,10 @@ dist: precise
|
||||||
sudo: true
|
sudo: true
|
||||||
language: c
|
language: c
|
||||||
|
|
||||||
jobs:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- &test-ubuntu
|
- &test-ubuntu
|
||||||
os: linux
|
os: linux
|
||||||
stage: test
|
|
||||||
compiler: gcc
|
compiler: gcc
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
|
@ -18,7 +17,7 @@ jobs:
|
||||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||||
script:
|
script:
|
||||||
- set -e
|
- set -e
|
||||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
- make -C test $COMMON_FLAGS $BTYPE
|
- make -C test $COMMON_FLAGS $BTYPE
|
||||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||||
- make -C utest $COMMON_FLAGS $BTYPE
|
- make -C utest $COMMON_FLAGS $BTYPE
|
||||||
|
@ -26,6 +25,15 @@ jobs:
|
||||||
- TARGET_BOX=LINUX64
|
- TARGET_BOX=LINUX64
|
||||||
- BTYPE="BINARY=64"
|
- BTYPE="BINARY=64"
|
||||||
|
|
||||||
|
- <<: *test-ubuntu
|
||||||
|
os: linux-ppc64le
|
||||||
|
before_script:
|
||||||
|
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||||
|
env:
|
||||||
|
# for matrix annotation only
|
||||||
|
- TARGET_BOX=PPC64LE_LINUX
|
||||||
|
- BTYPE="BINARY=64 USE_OPENMP=1"
|
||||||
|
|
||||||
- <<: *test-ubuntu
|
- <<: *test-ubuntu
|
||||||
env:
|
env:
|
||||||
- TARGET_BOX=LINUX64
|
- TARGET_BOX=LINUX64
|
||||||
|
@ -59,7 +67,6 @@ jobs:
|
||||||
- BTYPE="BINARY=32"
|
- BTYPE="BINARY=32"
|
||||||
|
|
||||||
- os: linux
|
- os: linux
|
||||||
stage: test
|
|
||||||
compiler: gcc
|
compiler: gcc
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
|
@ -80,13 +87,12 @@ jobs:
|
||||||
# that don't require sudo.
|
# that don't require sudo.
|
||||||
- &test-alpine
|
- &test-alpine
|
||||||
os: linux
|
os: linux
|
||||||
stage: test
|
|
||||||
dist: trusty
|
dist: trusty
|
||||||
sudo: true
|
sudo: true
|
||||||
language: minimal
|
language: minimal
|
||||||
before_install:
|
before_install:
|
||||||
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
|
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||||
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1"
|
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
|
||||||
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||||
install:
|
install:
|
||||||
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
|
||||||
|
@ -120,11 +126,10 @@ jobs:
|
||||||
- <<: *test-alpine
|
- <<: *test-alpine
|
||||||
env:
|
env:
|
||||||
- TARGET_BOX=LINUX64_MUSL
|
- TARGET_BOX=LINUX64_MUSL
|
||||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
|
||||||
|
|
||||||
- &test-cmake
|
- &test-cmake
|
||||||
os: linux
|
os: linux
|
||||||
stage: test
|
|
||||||
compiler: clang
|
compiler: clang
|
||||||
addons:
|
addons:
|
||||||
apt:
|
apt:
|
||||||
|
@ -153,20 +158,27 @@ jobs:
|
||||||
|
|
||||||
- &test-macos
|
- &test-macos
|
||||||
os: osx
|
os: osx
|
||||||
stage: test
|
osx_image: xcode10.1
|
||||||
osx_image: xcode8
|
|
||||||
before_script:
|
before_script:
|
||||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||||
- brew update
|
- brew update
|
||||||
- brew install gcc # for gfortran
|
- brew install gcc@8 # for gfortran
|
||||||
script:
|
script:
|
||||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||||
env:
|
env:
|
||||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||||
|
|
||||||
- <<: *test-macos
|
- <<: *test-macos
|
||||||
|
osx_image: xcode10.0
|
||||||
env:
|
env:
|
||||||
- BTYPE="BINARY=32"
|
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
|
||||||
|
|
||||||
|
- <<: *test-macos
|
||||||
|
osx_image: xcode10.1
|
||||||
|
env:
|
||||||
|
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
|
||||||
|
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||||
|
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||||
|
|
||||||
# whitelist
|
# whitelist
|
||||||
branches:
|
branches:
|
||||||
|
|
171
CMakeLists.txt
171
CMakeLists.txt
|
@ -6,21 +6,35 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||||
project(OpenBLAS C ASM)
|
project(OpenBLAS C ASM)
|
||||||
set(OpenBLAS_MAJOR_VERSION 0)
|
set(OpenBLAS_MAJOR_VERSION 0)
|
||||||
set(OpenBLAS_MINOR_VERSION 3)
|
set(OpenBLAS_MINOR_VERSION 3)
|
||||||
set(OpenBLAS_PATCH_VERSION 0.dev)
|
set(OpenBLAS_PATCH_VERSION 9.dev)
|
||||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||||
|
|
||||||
# Adhere to GNU filesystem layout conventions
|
# Adhere to GNU filesystem layout conventions
|
||||||
include(GNUInstallDirs)
|
include(GNUInstallDirs)
|
||||||
|
|
||||||
set(OpenBLAS_LIBNAME openblas)
|
include(CMakePackageConfigHelpers)
|
||||||
|
|
||||||
|
|
||||||
#######
|
#######
|
||||||
if(MSVC)
|
if(MSVC)
|
||||||
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||||
endif()
|
endif()
|
||||||
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF)
|
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||||
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF)
|
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
|
||||||
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF)
|
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
|
||||||
|
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
|
||||||
|
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||||
|
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||||
|
else()
|
||||||
|
set(NO_AFFINITY 1)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||||
|
# Avoids conflicts with other BLAS libraries, especially when using
|
||||||
|
# 64 bit integer interfaces in OpenBLAS.
|
||||||
|
|
||||||
|
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
|
||||||
|
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
|
||||||
#######
|
#######
|
||||||
if(BUILD_WITHOUT_LAPACK)
|
if(BUILD_WITHOUT_LAPACK)
|
||||||
set(NO_LAPACK 1)
|
set(NO_LAPACK 1)
|
||||||
|
@ -33,12 +47,27 @@ endif()
|
||||||
|
|
||||||
#######
|
#######
|
||||||
|
|
||||||
|
if(MSVC AND MSVC_STATIC_CRT)
|
||||||
|
set(CompilerFlags
|
||||||
|
CMAKE_CXX_FLAGS
|
||||||
|
CMAKE_CXX_FLAGS_DEBUG
|
||||||
|
CMAKE_CXX_FLAGS_RELEASE
|
||||||
|
CMAKE_C_FLAGS
|
||||||
|
CMAKE_C_FLAGS_DEBUG
|
||||||
|
CMAKE_C_FLAGS_RELEASE
|
||||||
|
)
|
||||||
|
foreach(CompilerFlag ${CompilerFlags})
|
||||||
|
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
|
||||||
|
endforeach()
|
||||||
|
endif()
|
||||||
|
|
||||||
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
|
message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
|
||||||
|
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
|
||||||
|
|
||||||
|
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
|
||||||
|
|
||||||
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
|
||||||
|
|
||||||
if (NOT DYNAMIC_ARCH)
|
if (NOT DYNAMIC_ARCH)
|
||||||
|
@ -51,10 +80,10 @@ endif ()
|
||||||
|
|
||||||
set(SUBDIRS ${BLASDIRS})
|
set(SUBDIRS ${BLASDIRS})
|
||||||
if (NOT NO_LAPACK)
|
if (NOT NO_LAPACK)
|
||||||
list(APPEND SUBDIRS lapack)
|
|
||||||
if(BUILD_RELAPACK)
|
if(BUILD_RELAPACK)
|
||||||
list(APPEND SUBDIRS relapack/src)
|
list(APPEND SUBDIRS relapack/src)
|
||||||
endif()
|
endif()
|
||||||
|
list(APPEND SUBDIRS lapack)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
# set which float types we want to build for
|
# set which float types we want to build for
|
||||||
|
@ -123,7 +152,7 @@ endif ()
|
||||||
|
|
||||||
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
|
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
|
||||||
if(MSVC)
|
if(MSVC)
|
||||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||||
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
|
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
|
||||||
endif()
|
endif()
|
||||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
|
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
|
||||||
|
@ -138,14 +167,9 @@ if (${DYNAMIC_ARCH})
|
||||||
endforeach()
|
endforeach()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
# Only build shared libs for MSVC
|
|
||||||
if (MSVC)
|
|
||||||
set(BUILD_SHARED_LIBS ON)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
|
|
||||||
# add objects to the openblas lib
|
# add objects to the openblas lib
|
||||||
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
|
||||||
|
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
|
||||||
|
|
||||||
# Android needs to explicitly link against libm
|
# Android needs to explicitly link against libm
|
||||||
if(ANDROID)
|
if(ANDROID)
|
||||||
|
@ -154,7 +178,7 @@ endif()
|
||||||
|
|
||||||
# Handle MSVC exports
|
# Handle MSVC exports
|
||||||
if(MSVC AND BUILD_SHARED_LIBS)
|
if(MSVC AND BUILD_SHARED_LIBS)
|
||||||
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
|
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
|
||||||
else()
|
else()
|
||||||
# Creates verbose .def file (51KB vs 18KB)
|
# Creates verbose .def file (51KB vs 18KB)
|
||||||
|
@ -165,6 +189,7 @@ endif()
|
||||||
# Set output for libopenblas
|
# Set output for libopenblas
|
||||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||||
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
|
||||||
|
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
|
||||||
|
|
||||||
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||||
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
|
||||||
|
@ -186,7 +211,8 @@ if (USE_THREAD)
|
||||||
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (MSVC OR NOT NOFORTRAN)
|
#if (MSVC OR NOT NOFORTRAN)
|
||||||
|
if (NOT NO_CBLAS)
|
||||||
# Broken without fortran on unix
|
# Broken without fortran on unix
|
||||||
add_subdirectory(utest)
|
add_subdirectory(utest)
|
||||||
endif()
|
endif()
|
||||||
|
@ -204,14 +230,92 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||||
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
SOVERSION ${OpenBLAS_MAJOR_VERSION}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
|
||||||
|
if (NOT MSVC)
|
||||||
|
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
|
||||||
|
else()
|
||||||
|
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
|
||||||
|
if (NOT DEFINED ARCH)
|
||||||
|
set(ARCH_IN "x86_64")
|
||||||
|
else()
|
||||||
|
set(ARCH_IN ${ARCH})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (${CORE} STREQUAL "generic")
|
||||||
|
set(ARCH_IN "GENERIC")
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (NOT DEFINED EXPRECISION)
|
||||||
|
set(EXPRECISION_IN 0)
|
||||||
|
else()
|
||||||
|
set(EXPRECISION_IN ${EXPRECISION})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED NO_CBLAS)
|
||||||
|
set(NO_CBLAS_IN 0)
|
||||||
|
else()
|
||||||
|
set(NO_CBLAS_IN ${NO_CBLAS})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED NO_LAPACK)
|
||||||
|
set(NO_LAPACK_IN 0)
|
||||||
|
else()
|
||||||
|
set(NO_LAPACK_IN ${NO_LAPACK})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED NO_LAPACKE)
|
||||||
|
set(NO_LAPACKE_IN 0)
|
||||||
|
else()
|
||||||
|
set(NO_LAPACKE_IN ${NO_LAPACKE})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED NEED2UNDERSCORES)
|
||||||
|
set(NEED2UNDERSCORES_IN 0)
|
||||||
|
else()
|
||||||
|
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED ONLY_CBLAS)
|
||||||
|
set(ONLY_CBLAS_IN 0)
|
||||||
|
else()
|
||||||
|
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED BU)
|
||||||
|
set(BU _)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (NOT ${SYMBOLPREFIX} STREQUAL "")
|
||||||
|
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||||
|
endif()
|
||||||
|
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
|
||||||
|
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||||
|
endif()
|
||||||
|
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
|
||||||
|
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||||
|
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||||
|
COMMENT "renaming symbols"
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
# Install project
|
# Install project
|
||||||
|
|
||||||
# Install libraries
|
# Install libraries
|
||||||
install(TARGETS ${OpenBLAS_LIBNAME}
|
install(TARGETS ${OpenBLAS_LIBNAME}
|
||||||
|
EXPORT "OpenBLAS${SUFFIX64}Targets"
|
||||||
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||||
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
|
||||||
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
|
||||||
|
|
||||||
|
# Install headers
|
||||||
|
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||||
|
set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
|
||||||
|
|
||||||
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||||
|
|
||||||
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
|
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
|
||||||
|
@ -231,7 +335,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||||
if(NOT NOFORTRAN)
|
if(NOT NOFORTRAN)
|
||||||
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||||
|
|
||||||
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
|
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
|
||||||
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
|
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
|
||||||
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
|
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
|
||||||
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
|
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
|
||||||
|
@ -244,10 +348,11 @@ endif()
|
||||||
if(NOT NO_CBLAS)
|
if(NOT NO_CBLAS)
|
||||||
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
|
||||||
|
|
||||||
|
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
|
||||||
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
|
||||||
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
|
||||||
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
|
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
|
||||||
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
|
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(NOT NO_LAPACKE)
|
if(NOT NO_LAPACKE)
|
||||||
|
@ -259,11 +364,31 @@ if(NOT NO_LAPACKE)
|
||||||
ADD_CUSTOM_TARGET(genlapacke
|
ADD_CUSTOM_TARGET(genlapacke
|
||||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
|
||||||
)
|
)
|
||||||
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
|
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
include(FindPkgConfig QUIET)
|
include(FindPkgConfig QUIET)
|
||||||
if(PKG_CONFIG_FOUND)
|
if(PKG_CONFIG_FOUND)
|
||||||
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
|
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
|
||||||
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
|
||||||
|
set(PN OpenBLAS)
|
||||||
|
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
|
||||||
|
configure_package_config_file(cmake/${PN}Config.cmake.in
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
|
||||||
|
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||||
|
VERSION ${${PN}_VERSION}
|
||||||
|
COMPATIBILITY AnyNewerVersion)
|
||||||
|
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake
|
||||||
|
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
|
||||||
|
RENAME ${PN}${SUFFIX64}ConfigVersion.cmake
|
||||||
|
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
install(EXPORT "${PN}${SUFFIX64}Targets"
|
||||||
|
NAMESPACE "${PN}${SUFFIX64}::"
|
||||||
|
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
|
||||||
|
|
||||||
|
|
|
@ -167,4 +167,16 @@ In chronological order:
|
||||||
* [2017-02-26] ztrmm kernel for IBM z13
|
* [2017-02-26] ztrmm kernel for IBM z13
|
||||||
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
* [2017-03-13] strmm and ctrmm kernel for IBM z13
|
||||||
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
|
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
|
||||||
|
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
|
||||||
|
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
||||||
|
* [2019-03-14] power9 dgemm/dtrmm kernel
|
||||||
|
* [2019-04-29] power9 sgemm/strmm kernel
|
||||||
|
|
||||||
|
* Jiachen Wang <https://github.com/wjc404>
|
||||||
|
* [2019-07-29] optimize AVX2 DGEMM
|
||||||
|
* [2019-10-20] AVX512 DGEMM kernel (4x8)
|
||||||
|
* [2019-11-06] optimize AVX512 SGEMM
|
||||||
|
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
|
||||||
|
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
||||||
|
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
|
||||||
|
* [2020-01-07] optimize AVX2 SGEMM and STRMM
|
||||||
|
|
418
Changelog.txt
418
Changelog.txt
|
@ -1,4 +1,422 @@
|
||||||
OpenBLAS ChangeLog
|
OpenBLAS ChangeLog
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.8
|
||||||
|
9-Feb-2020
|
||||||
|
|
||||||
|
common:
|
||||||
|
* LAPACK has been updated to 3.9.0 (plus patches up to
|
||||||
|
January 2nd, 2020)
|
||||||
|
* CMAKE support has been improved in several areas including
|
||||||
|
cross-compilation
|
||||||
|
* a thread race condition in the GEMM3M kernels was resolved
|
||||||
|
* the "generic" (plain C) gemm beta kernel used by many targets
|
||||||
|
has been sped up
|
||||||
|
* an optimized version of the LAPACK trtrs functions has been added
|
||||||
|
* an incompatibility between the LAPACK tests and the OpenBLAS
|
||||||
|
implementation of XERBLA was resolved, removing the numerous
|
||||||
|
warnings about wrong error exits in the former
|
||||||
|
* support for NetBSD has been added
|
||||||
|
* support for compilation with g95 and non-GNU versions of ld
|
||||||
|
has been improved
|
||||||
|
* support for compilation with (upcoming) gcc 10 has been added
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* worked around miscompilation of several POWER8 and POWER9
|
||||||
|
kernels by older versions of gcc
|
||||||
|
* added support for big-endian POWER8 and for compilation on AIX
|
||||||
|
* corrected bugs in the big-endian support for PPC440 and PPC970
|
||||||
|
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||||
|
|
||||||
|
ARMV8:
|
||||||
|
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
|
||||||
|
* compilation for 32bit works again
|
||||||
|
* performance of the RPCC function has been improved
|
||||||
|
* improved performance on small systems
|
||||||
|
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||||
|
* cross-compilation from OSX to IOS was simplified
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
|
||||||
|
was significantly improved
|
||||||
|
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
|
||||||
|
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
|
||||||
|
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
|
||||||
|
* added support for QEMU virtual cpus
|
||||||
|
* a compilation problem with PGI and SUN compilers was fixed
|
||||||
|
* Intel "Goldmont plus" is now autodetected
|
||||||
|
* a potential crash on program exit on MS Windows has been fixed
|
||||||
|
|
||||||
|
x86:
|
||||||
|
* an unwanted case sensitivity in the implementation of LSAME
|
||||||
|
on older 32bit AMD cpus was fixed
|
||||||
|
|
||||||
|
zarch:
|
||||||
|
* Z15 is now supported as Z14
|
||||||
|
* DYNAMIC_ARCH is now available on ZARCH as well
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.7
|
||||||
|
11-Aug-2019
|
||||||
|
|
||||||
|
common:
|
||||||
|
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||||
|
defined no longer causes build failures in ctest or utest
|
||||||
|
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||||
|
has the same effect as setting them to 1
|
||||||
|
* a new test program was added to allow checking the library for
|
||||||
|
thread safety
|
||||||
|
* a new option USE_LOCKING was added to ensure thread safety when
|
||||||
|
OpenBLAS itself is built without multithreading but will be
|
||||||
|
called from multiple threads.
|
||||||
|
* a build failure on Linux with glibc versions earlier than 2.5
|
||||||
|
was fixed
|
||||||
|
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||||
|
on glibc 2.6 was fixed
|
||||||
|
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||||
|
active on Linux, as in the gmake builds)
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* the build-time logic for detection of AVX512 availability in
|
||||||
|
the processor and compiler was fixed
|
||||||
|
* gmake builds on OSX now set the internal name of the library to
|
||||||
|
libopenblas.0.dylib (consistent with CMAKE)
|
||||||
|
* the Haswell DGEMM kernel received a significant speedup through
|
||||||
|
improved prefetch and load instructions
|
||||||
|
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||||
|
increased by avoiding vpermpd instructions
|
||||||
|
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||||
|
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||||
|
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||||
|
|
||||||
|
ARMV7:
|
||||||
|
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||||
|
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||||
|
they were appropriate for only a subset of platforms
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.6
|
||||||
|
29-Apr-2019
|
||||||
|
|
||||||
|
common:
|
||||||
|
* the build tools now check that a given cpu TARGET is actually valid
|
||||||
|
* the build-time check of system features (c_check) has been made
|
||||||
|
less dependent on particular perl features (this should mainly
|
||||||
|
benefit building on Windows)
|
||||||
|
* several problems with the ReLAPACK integration were fixed,
|
||||||
|
including INTERFACE64 support and building a shared library
|
||||||
|
* building with CMAKE on BSD systems was improved
|
||||||
|
* a non-absolute SUM function was added based on the
|
||||||
|
existing optimized code for ASUM
|
||||||
|
* CBLAS interfaces to the IxMIN and IxMAX functions were added
|
||||||
|
* a name clash between LAPACKE and BOOST headers was resolved
|
||||||
|
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
|
||||||
|
kernels
|
||||||
|
* a crash on thread (key) deletion with the USE_TLS=1 memory management
|
||||||
|
option was fixed
|
||||||
|
* restored several earlier fixes, in particular for OpenMP performance,
|
||||||
|
building on BSD, and calling fork on CYGWIN, which had inadvertently
|
||||||
|
been dropped in the 0.3.3 rewrite of the memory management code.
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
|
||||||
|
* building with old versions of MSVC was fixed
|
||||||
|
* it is now possible to build a static library on Windows with CMAKE
|
||||||
|
* accessing environment variables on CYGWIN at run time was fixed
|
||||||
|
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
|
||||||
|
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
|
||||||
|
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
|
||||||
|
with CMAKE as well
|
||||||
|
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
|
||||||
|
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
|
||||||
|
* assembly bugs involving undeclared modification of input operands were fixed
|
||||||
|
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
|
||||||
|
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
|
||||||
|
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
|
||||||
|
* a similar bug was fixed in the blas_quickdivide code used to split workloads
|
||||||
|
in most functions
|
||||||
|
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
|
||||||
|
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
|
||||||
|
environment does not support AVX512
|
||||||
|
* improved GEMM performance on ZEN targets
|
||||||
|
|
||||||
|
x86:
|
||||||
|
* build failures caused by the recently added checks for AVX512 were fixed
|
||||||
|
* an inline assembly bug involving undeclared modification of an input argument was
|
||||||
|
fixed in the blas_quickdivide code used to split workloads in most functions
|
||||||
|
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
|
||||||
|
|
||||||
|
MIPS32:
|
||||||
|
* a bug in the IMIN implementation made it return the result of IMAX
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* single precision BLAS1/2 functions have received optimized POWER8 kernels
|
||||||
|
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
|
||||||
|
* building on PPC970 systems under OSX Leopard or Tiger is now supported
|
||||||
|
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
|
||||||
|
* building a shared library on AIX is now supported for POWER6
|
||||||
|
* DYNAMIC_ARCH support has been added for POWER6 and newer
|
||||||
|
|
||||||
|
ARMv7:
|
||||||
|
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||||
|
* a bug in the IMIN implementation made it return the result of IMAX
|
||||||
|
|
||||||
|
ARMv8:
|
||||||
|
* added support for HiSilicon TSV110 cpus
|
||||||
|
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
|
||||||
|
* cross-compilation with CMAKE now works again
|
||||||
|
* a bug in the IMIN implementation made it return the result of IMAX
|
||||||
|
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
|
||||||
|
|
||||||
|
IBM Z:
|
||||||
|
* optimized microkernels for single precision BLAS1/2 functions have been added
|
||||||
|
for both Z13 and Z14
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.5
|
||||||
|
31-Dec-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* loop unrolling in TRMV has been enabled again.
|
||||||
|
* A domain error in the thread workload distribution for SYRK
|
||||||
|
has been fixed.
|
||||||
|
* gmake builds will now automatically add -fPIC to the build
|
||||||
|
options if the platform requires it.
|
||||||
|
* a pthreads key leakage (and associated crash on dlclose) in
|
||||||
|
the USE_TLS codepath was fixed.
|
||||||
|
* building of the utest cases on systems that do not provide
|
||||||
|
an implementation of complex.h was fixed.
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* the SkylakeX code was changed to compile on OSX.
|
||||||
|
* unwanted application of the -march=skylake-avx512 option
|
||||||
|
to the common code parts of a DYNAMIC_ARCH build was fixed.
|
||||||
|
* improved performance of SGEMM for small workloads on Skylake X.
|
||||||
|
* performance of SGEMM and DGEMM was improved on Haswell.
|
||||||
|
|
||||||
|
ARMV8:
|
||||||
|
* a configuration error that broke the CNRM2 kernel was corrected.
|
||||||
|
* compilation of the GEMM kernels with CMAKE was fixed.
|
||||||
|
* DYNAMIC_ARCH builds are now available with CMAKE as well.
|
||||||
|
* using CMAKE for cross-compilation to the new cpu TARGETs
|
||||||
|
introduced in 0.3.4 now works.
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* a problem in cpu autodetection for AIX has been corrected.
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.4
|
||||||
|
02-Dec-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* the new, experimental thread-local memory allocation had
|
||||||
|
inadvertently been left enabled for gmake builds in 0.3.3
|
||||||
|
despite the announcement. It is now disabled by default, and
|
||||||
|
single-threaded builds will keep using the old allocator even
|
||||||
|
if the USE_TLS option is turned on.
|
||||||
|
* OpenBLAS will now provide enough buffer space for at least 50
|
||||||
|
threads by default.
|
||||||
|
* The output of openblas_get_config() now contains the version
|
||||||
|
number.
|
||||||
|
* A serious thread safety bug in GEMV operation with small M and
|
||||||
|
large N size has been fixed.
|
||||||
|
* The code will now automatically call blas_thread_init after a
|
||||||
|
fork if needed before handling a call to openblas_set_num_threads
|
||||||
|
* Accesses to parallelized level3 functions from multiple callers
|
||||||
|
are now serialized to avoid thread races (unless using OpenMP).
|
||||||
|
This should provide better performance than the known-threadsafe
|
||||||
|
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
|
||||||
|
* When building LAPACK with gfortran, -frecursive is now (again)
|
||||||
|
enabled by default to ensure correct behaviour.
|
||||||
|
* The OpenBLAS version cblas.h now supports both CBLAS_ORDER and
|
||||||
|
CBLAS_LAYOUT as the name of the matrix row/column order option.
|
||||||
|
* Externally set LDFLAGS are now passed through to the final compile/link
|
||||||
|
steps to facilitate setting platform-specific linker flags.
|
||||||
|
* A potential race condition during the build of LAPACK (that would
|
||||||
|
usually manifest itself as a failure to build TESTING/MATGEN) has been
|
||||||
|
fixed.
|
||||||
|
* xHEMV has been changed to stay single-threaded for small input sizes
|
||||||
|
where the overhead of multithreading exceeds any possible gains
|
||||||
|
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
|
||||||
|
ThunderX hardware with sizable input.
|
||||||
|
* Linker flags for the PGI compiler have been updated
|
||||||
|
* Behaviour of AXPY with zero increments is now handled in the C interface,
|
||||||
|
correcting the result on at least Intel Atom.
|
||||||
|
* The result matrix from calling SGELSS with an all-zero input matrix is
|
||||||
|
now zeroed completely.
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* Autodetection of AMD Ryzen2 has been fixed (again).
|
||||||
|
* CMAKE builds now support labeling of an INTERFACE64=1 build of
|
||||||
|
the library with the _64 suffix.
|
||||||
|
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
|
||||||
|
has been sped up by rewriting with C intrinsics
|
||||||
|
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
|
||||||
|
* CPU type detection has been implemented for AIX.
|
||||||
|
* CPU type detection has been fixed for NETBSD.
|
||||||
|
|
||||||
|
MIPS64:
|
||||||
|
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
|
||||||
|
* DSDOT on LOONGSON3A has been fixed.
|
||||||
|
* the SGEMM microkernel has been hardened against potential data loss.
|
||||||
|
|
||||||
|
ARMV8:
|
||||||
|
* DYNAMIC_ARCH support is now available for 64bit ARM
|
||||||
|
* cross-compiling for ARMV8 under iOS now works.
|
||||||
|
* cpu-specific code has been rearranged to make better use of both
|
||||||
|
hardware commonalities and model-specific compiler optimizations.
|
||||||
|
* XGENE1 has been removed as a TARGET, superseded by the improved generic
|
||||||
|
ARMV8 support.
|
||||||
|
|
||||||
|
ARMV7:
|
||||||
|
* Older assembly mnemonics have been converted to UAL form to allow
|
||||||
|
building with clang 7.0
|
||||||
|
* Cross compiling LAPACKE for Android has been fixed again (broken by
|
||||||
|
update to LAPACK 3.7.0 some while ago).
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.3
|
||||||
|
31-Aug-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* thread memory allocation has been switched back to the method
|
||||||
|
used before version 0.3.1 due to unexpected problems caused by
|
||||||
|
the new code under some circumstances. A new compile-time option
|
||||||
|
USE_TLS has been added to enable the new code, and it is hoped
|
||||||
|
that this can become the default again in the next version.
|
||||||
|
* LAPACK PR272 has been integrated, which fixes spurious errors
|
||||||
|
in DSYEVR and related functions caused by missing conversion
|
||||||
|
from ILAENV to ILAENV_2STAGE in several _2stage routines.
|
||||||
|
* the cmake-generated OpenBLASConfig.cmake now uses correct case
|
||||||
|
for the name of the library
|
||||||
|
* added support for Haiku OS
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
|
||||||
|
DSCAL, DGEMVN and DSYMVL
|
||||||
|
* added a workaround for a cygwin issue that prevented compilation
|
||||||
|
of AVX512 code
|
||||||
|
|
||||||
|
IBM Z:
|
||||||
|
* added autodetection of Z14
|
||||||
|
* fixed TRMM errors in the generic target
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.2
|
||||||
|
30-Jul-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* fixes for regressions caused by the rewrite of the thread
|
||||||
|
initialization code in 0.3.1
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* fixed cpu autodetection for the BSDs
|
||||||
|
|
||||||
|
MIPS64:
|
||||||
|
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* added autodetection of AMD Ryzen 2
|
||||||
|
* fixed build with older versions of MSVC
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.1
|
||||||
|
01-Jul-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* rewritten thread initialization code with significantly reduced overhead
|
||||||
|
* added CBLAS interfaces to the IxAMIN BLAS extension functions
|
||||||
|
* fixed the lapack-test target
|
||||||
|
* CMAKE builds now create an OpenBLASConfig.cmake file
|
||||||
|
* ZAXPY now uses a single thread for small input sizes
|
||||||
|
* the LAPACK code was updated from Reference-LAPACK/lapack#253
|
||||||
|
(fixing LAPACKE interfaces to Aasen's functions)
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* corrected CROT and ZROT behaviour with zero INC_X
|
||||||
|
|
||||||
|
ARMV7:
|
||||||
|
* corrected xDOT behaviour with zero INC_X or INC_Y
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
|
||||||
|
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
|
||||||
|
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
|
||||||
|
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows one to
|
||||||
|
specify the list of x86_64 targets to include. Any target not on the list will be supported
|
||||||
|
by the Sandybridge or Nehalem kernels if available, or by Prescott.
|
||||||
|
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
|
||||||
|
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
|
||||||
|
* added autodetection of Intel Cannon Lake series as Skylake X
|
||||||
|
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
|
||||||
|
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
|
||||||
|
recent mingw from MSYS2
|
||||||
|
* fixed a link error in mixed clang/gfortran builds with OpenMP
|
||||||
|
* updated the OSX deployment target to 10.8
|
||||||
|
* switched on parallel make for builds on MS Windows by default
|
||||||
|
|
||||||
|
x86:
|
||||||
|
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
|
||||||
|
|
||||||
|
====================================================================
|
||||||
|
Version 0.3.0
|
||||||
|
23-May-2018
|
||||||
|
|
||||||
|
common:
|
||||||
|
* fixed some more thread race and locking bugs
|
||||||
|
* added preliminary support for calling an OpenMP build of the library from multiple threads
|
||||||
|
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
|
||||||
|
* general code cleanup
|
||||||
|
* optimized DSDOT implementation
|
||||||
|
* improved thread distribution for GEMM
|
||||||
|
* corrected IMATCOPY/OMATCOPY implementation
|
||||||
|
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
|
||||||
|
* cmake build improvements
|
||||||
|
* pkgconfig file now contains build options
|
||||||
|
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
|
||||||
|
* corrections and improvements for systems with more than 64 cpus
|
||||||
|
* LAPACK code updated to 3.8.0 including later fixes
|
||||||
|
* added ReLAPACK, a recursive implementation of several LAPACK functions
|
||||||
|
* Rewrote ROTMG to handle cases that the netlib code failed to address
|
||||||
|
* Disabled (broken) multithreading code for xTRMV
|
||||||
|
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
|
||||||
|
* shared memory access failures on startup are now handled more gracefully
|
||||||
|
* restored utests from earlier releases (and made them pass on all affected systems)
|
||||||
|
|
||||||
|
SPARC:
|
||||||
|
* several fixes for cpu autodetection
|
||||||
|
|
||||||
|
POWER:
|
||||||
|
* corrected vector register overwriting in several Power8 kernels
|
||||||
|
* optimized additional BLAS functions
|
||||||
|
|
||||||
|
ARM:
|
||||||
|
* added support for CortexA53 and A72
|
||||||
|
* added autodetection for ThunderX2T99
|
||||||
|
* made most optimized kernels the default for generic ARMv8 targets
|
||||||
|
|
||||||
|
x86_64:
|
||||||
|
* parallelized DDOT kernel for Haswell
|
||||||
|
* changed alignment directives in assembly kernels to boost performance on OSX
|
||||||
|
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
|
||||||
|
* added support for building on OpenBSD and Dragonfly
|
||||||
|
* updated compiler options to work with Intel release 2018
|
||||||
|
* support fully optimized build with clang/flang on Microsoft Windows
|
||||||
|
* fixed building on AIX
|
||||||
|
|
||||||
|
IBM Z:
|
||||||
|
* added optimized BLAS 1/2 functions
|
||||||
|
|
||||||
|
MIPS:
|
||||||
|
* fixed cpu autodetection helper code
|
||||||
|
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
|
||||||
|
* added mips64 I6500 cpu
|
||||||
|
|
||||||
====================================================================
|
====================================================================
|
||||||
Version 0.2.20
|
Version 0.2.20
|
||||||
24-Jul-2017
|
24-Jul-2017
|
||||||
|
|
73
Makefile
73
Makefile
|
@ -21,9 +21,20 @@ ifeq ($(BUILD_RELAPACK), 1)
|
||||||
RELA = re_lapack
|
RELA = re_lapack
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(NO_FORTRAN), 1)
|
||||||
|
define NOFORTRAN
|
||||||
|
1
|
||||||
|
endef
|
||||||
|
define NO_LAPACK
|
||||||
|
1
|
||||||
|
endef
|
||||||
|
export NOFORTRAN
|
||||||
|
export NO_LAPACK
|
||||||
|
endif
|
||||||
|
|
||||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||||
|
|
||||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||||
|
|
||||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||||
|
@ -47,7 +58,7 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
|
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
|
||||||
ifndef NOFORTRAN
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
|
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
|
||||||
endif
|
endif
|
||||||
ifneq ($(OSNAME), AIX)
|
ifneq ($(OSNAME), AIX)
|
||||||
|
@ -85,8 +96,8 @@ endif
|
||||||
@echo
|
@echo
|
||||||
|
|
||||||
shared :
|
shared :
|
||||||
ifndef NO_SHARED
|
ifneq ($(NO_SHARED), 1)
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||||
@$(MAKE) -C exports so
|
@$(MAKE) -C exports so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
|
@ -98,6 +109,7 @@ endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
@$(MAKE) -C exports dyn
|
@$(MAKE) -C exports dyn
|
||||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||||
|
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
@$(MAKE) -C exports dll
|
@$(MAKE) -C exports dll
|
||||||
|
@ -108,19 +120,22 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
tests :
|
tests :
|
||||||
ifndef NOFORTRAN
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
touch $(LIBNAME)
|
touch $(LIBNAME)
|
||||||
ifndef NO_FBLAS
|
ifndef NO_FBLAS
|
||||||
$(MAKE) -C test all
|
$(MAKE) -C test all
|
||||||
$(MAKE) -C utest all
|
|
||||||
endif
|
endif
|
||||||
|
$(MAKE) -C utest all
|
||||||
ifndef NO_CBLAS
|
ifndef NO_CBLAS
|
||||||
$(MAKE) -C ctest all
|
$(MAKE) -C ctest all
|
||||||
|
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||||
|
$(MAKE) -C cpp_thread_test all
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
libs :
|
libs :
|
||||||
ifeq ($(CORE), UNKOWN)
|
ifeq ($(CORE), UNKNOWN)
|
||||||
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
|
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
|
||||||
endif
|
endif
|
||||||
ifeq ($(NOFORTRAN), 1)
|
ifeq ($(NOFORTRAN), 1)
|
||||||
|
@ -153,6 +168,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
|
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
|
||||||
done
|
done
|
||||||
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
|
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
ifdef USE_THREAD
|
ifdef USE_THREAD
|
||||||
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||||
|
@ -207,7 +225,7 @@ netlib :
|
||||||
|
|
||||||
else
|
else
|
||||||
netlib : lapack_prebuild
|
netlib : lapack_prebuild
|
||||||
ifndef NOFORTRAN
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
||||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
|
||||||
endif
|
endif
|
||||||
|
@ -228,22 +246,22 @@ prof_lapack : lapack_prebuild
|
||||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
||||||
|
|
||||||
lapack_prebuild :
|
lapack_prebuild :
|
||||||
ifndef NOFORTRAN
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
@ -253,6 +271,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
ifdef SMP
|
ifdef SMP
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
|
else ifeq ($(OSNAME), Haiku)
|
||||||
|
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
else
|
else
|
||||||
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||||
endif
|
endif
|
||||||
|
@ -271,21 +291,21 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
large.tgz :
|
large.tgz :
|
||||||
ifndef NOFORTRAN
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
if [ ! -a $< ]; then
|
if [ ! -a $< ]; then
|
||||||
-wget http://www.netlib.org/lapack/timing/large.tgz;
|
-wget http://www.netlib.org/lapack/timing/large.tgz;
|
||||||
fi
|
fi
|
||||||
endif
|
endif
|
||||||
|
|
||||||
timing.tgz :
|
timing.tgz :
|
||||||
ifndef NOFORTRAN
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
if [ ! -a $< ]; then
|
if [ ! -a $< ]; then
|
||||||
-wget http://www.netlib.org/lapack/timing/timing.tgz;
|
-wget http://www.netlib.org/lapack/timing/timing.tgz;
|
||||||
fi
|
fi
|
||||||
endif
|
endif
|
||||||
|
|
||||||
lapack-timing : large.tgz timing.tgz
|
lapack-timing : large.tgz timing.tgz
|
||||||
ifndef NOFORTRAN
|
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||||
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
|
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
|
||||||
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
|
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
|
||||||
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
|
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
|
||||||
|
@ -294,11 +314,12 @@ endif
|
||||||
|
|
||||||
lapack-test :
|
lapack-test :
|
||||||
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
||||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
|
||||||
|
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||||
ifneq ($(CROSS), 1)
|
ifneq ($(CROSS), 1)
|
||||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
|
||||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
lapack-runtest:
|
lapack-runtest:
|
||||||
|
@ -308,9 +329,9 @@ lapack-runtest:
|
||||||
|
|
||||||
|
|
||||||
blas-test:
|
blas-test:
|
||||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
|
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
||||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
|
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
||||||
|
|
||||||
|
|
||||||
dummy :
|
dummy :
|
||||||
|
|
|
@ -9,11 +9,6 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), ARMV6)
|
ifeq ($(CORE), ARMV6)
|
||||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
CCOMMON_OPT += -mfpu=vfp
|
||||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
FCOMMON_OPT += -mfpu=vfp
|
||||||
endif
|
|
||||||
|
|
||||||
ifeq ($(CORE), ARMV5)
|
|
||||||
CCOMMON_OPT += -march=armv5
|
|
||||||
FCOMMON_OPT += -march=armv5
|
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -4,22 +4,45 @@ CCOMMON_OPT += -march=armv8-a
|
||||||
FCOMMON_OPT += -march=armv8-a
|
FCOMMON_OPT += -march=armv8-a
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), CORTEXA57)
|
ifeq ($(CORE), CORTEXA53)
|
||||||
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), VULCAN)
|
ifeq ($(CORE), CORTEXA57)
|
||||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), CORTEXA72)
|
||||||
|
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), CORTEXA73)
|
||||||
|
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||||
|
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), THUNDERX)
|
ifeq ($(CORE), THUNDERX)
|
||||||
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||||
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
|
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), FALKOR)
|
||||||
|
CCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||||
|
FCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), THUNDERX2T99)
|
ifeq ($(CORE), THUNDERX2T99)
|
||||||
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
|
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||||
|
ifeq ($(CORE), TSV110)
|
||||||
|
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||||
|
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
|
@ -48,8 +48,10 @@ ifndef NO_CBLAS
|
||||||
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifneq ($(OSNAME), AIX)
|
||||||
ifndef NO_LAPACKE
|
ifndef NO_LAPACKE
|
||||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||||
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||||
|
@ -57,21 +59,22 @@ ifndef NO_LAPACKE
|
||||||
endif
|
endif
|
||||||
|
|
||||||
#for install static library
|
#for install static library
|
||||||
ifndef NO_STATIC
|
ifneq ($(NO_STATIC),1)
|
||||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
endif
|
endif
|
||||||
#for install shared library
|
#for install shared library
|
||||||
ifndef NO_SHARED
|
ifneq ($(NO_SHARED),1)
|
||||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
|
||||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
|
@ -79,9 +82,10 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||||
|
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||||
endif
|
endif
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||||
|
@ -93,11 +97,40 @@ ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
else
|
||||||
|
#install on AIX has different options syntax
|
||||||
|
ifndef NO_LAPACKE
|
||||||
|
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||||
|
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||||
|
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||||
|
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||||
|
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||||
|
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||||
|
endif
|
||||||
|
|
||||||
|
#for install static library
|
||||||
|
ifneq ($(NO_STATIC),1)
|
||||||
|
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
|
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
|
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||||
|
endif
|
||||||
|
#for install shared library
|
||||||
|
ifneq ($(NO_SHARED),1)
|
||||||
|
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||||
|
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||||
|
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||||
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||||
|
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||||
|
endif
|
||||||
|
|
||||||
|
endif
|
||||||
|
|
||||||
#Generating openblas.pc
|
#Generating openblas.pc
|
||||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
|
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||||
|
@ -108,7 +141,7 @@ endif
|
||||||
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
|
|
||||||
ifndef NO_SHARED
|
ifneq ($(NO_SHARED),1)
|
||||||
#ifeq logical or
|
#ifeq logical or
|
||||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||||
|
|
|
@ -9,7 +9,15 @@ else
|
||||||
USE_OPENMP = 1
|
USE_OPENMP = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), POWER9)
|
||||||
|
ifeq ($(USE_OPENMP), 1)
|
||||||
|
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
|
||||||
|
else
|
||||||
|
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
|
||||||
|
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), POWER8)
|
ifeq ($(CORE), POWER8)
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
|
@ -21,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# workaround for C->FORTRAN ABI violation in LAPACKE
|
||||||
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
|
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||||
|
endif
|
||||||
|
|
||||||
FLAMEPATH = $(HOME)/flame/lib
|
FLAMEPATH = $(HOME)/flame/lib
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,10 @@ ifdef CPUIDEMU
|
||||||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), 1004K)
|
||||||
|
TARGET_FLAGS = -mips32r2
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(TARGET), P5600)
|
ifeq ($(TARGET), P5600)
|
||||||
TARGET_FLAGS = -mips32r5
|
TARGET_FLAGS = -mips32r5
|
||||||
endif
|
endif
|
||||||
|
@ -38,7 +42,7 @@ all: getarch_2nd
|
||||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||||
|
|
||||||
config.h : c_check f_check getarch
|
config.h : c_check f_check getarch
|
||||||
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS)
|
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
|
||||||
ifneq ($(ONLY_CBLAS), 1)
|
ifneq ($(ONLY_CBLAS), 1)
|
||||||
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
|
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
|
||||||
else
|
else
|
||||||
|
@ -55,13 +59,13 @@ endif
|
||||||
|
|
||||||
|
|
||||||
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
|
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
|
||||||
$(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
|
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
|
||||||
|
|
||||||
getarch_2nd : getarch_2nd.c config.h dummy
|
getarch_2nd : getarch_2nd.c config.h dummy
|
||||||
ifndef TARGET_CORE
|
ifndef TARGET_CORE
|
||||||
$(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c
|
$(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c
|
||||||
else
|
else
|
||||||
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
|
$(HOSTCC) -I. $(HOST_CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
dummy:
|
dummy:
|
||||||
|
|
137
Makefile.rule
137
Makefile.rule
|
@ -3,7 +3,7 @@
|
||||||
#
|
#
|
||||||
|
|
||||||
# This library's version
|
# This library's version
|
||||||
VERSION = 0.3.0.dev
|
VERSION = 0.3.9.dev
|
||||||
|
|
||||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||||
|
@ -17,6 +17,11 @@ VERSION = 0.3.0.dev
|
||||||
# If you want to support multiple architecture in one binary
|
# If you want to support multiple architecture in one binary
|
||||||
# DYNAMIC_ARCH = 1
|
# DYNAMIC_ARCH = 1
|
||||||
|
|
||||||
|
# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
|
||||||
|
# mode (including individual optimized codes for PENRYN, DUNNINGTON, OPTERON,
|
||||||
|
# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
|
||||||
|
# DYNAMIC_OLDER = 1
|
||||||
|
|
||||||
# C compiler including binary type(32bit / 64bit). Default is gcc.
|
# C compiler including binary type(32bit / 64bit). Default is gcc.
|
||||||
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
|
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
|
||||||
# CC = gcc
|
# CC = gcc
|
||||||
|
@ -43,6 +48,8 @@ VERSION = 0.3.0.dev
|
||||||
# HOSTCC = gcc
|
# HOSTCC = gcc
|
||||||
|
|
||||||
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
|
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
|
||||||
|
# Please note that AVX is not available on 32-bit.
|
||||||
|
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
|
||||||
# BINARY=64
|
# BINARY=64
|
||||||
|
|
||||||
# About threaded BLAS. It will be automatically detected if you don't
|
# About threaded BLAS. It will be automatically detected if you don't
|
||||||
|
@ -51,33 +58,72 @@ VERSION = 0.3.0.dev
|
||||||
# For force setting for multi threaded, specify USE_THREAD = 1
|
# For force setting for multi threaded, specify USE_THREAD = 1
|
||||||
# USE_THREAD = 0
|
# USE_THREAD = 0
|
||||||
|
|
||||||
|
# If you want to build a single-threaded OpenBLAS, but expect to call this
|
||||||
|
# from several concurrent threads in some other program, comment this in for
|
||||||
|
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
|
||||||
|
# be necessary when USE_OPENMP=1)
|
||||||
|
# USE_LOCKING = 1
|
||||||
|
|
||||||
# If you're going to use this library with OpenMP, please comment it in.
|
# If you're going to use this library with OpenMP, please comment it in.
|
||||||
# This flag is always set for POWER8. Don't modify the flag
|
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
|
||||||
# USE_OPENMP = 1
|
# USE_OPENMP = 1
|
||||||
|
|
||||||
# You can define maximum number of threads. Basically it should be
|
# The OpenMP scheduler to use - by default this is "static" and you
|
||||||
# less than actual number of cores. If you don't specify one, it's
|
# will normally not want to change this unless you know that your main
|
||||||
# automatically detected by the the script.
|
# workload will involve tasks that have highly unbalanced running times
|
||||||
|
# for individual threads. Changing away from "static" may also adversely
|
||||||
|
# affect memory access locality in NUMA systems. Setting to "runtime" will
|
||||||
|
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
|
||||||
|
# CCOMMON_OPT += -DOMP_SCHED=dynamic
|
||||||
|
|
||||||
|
# You can define the maximum number of threads. Basically it should be less
|
||||||
|
# than or equal to the number of CPU threads. If you don't specify one, it's
|
||||||
|
# automatically detected by the build system.
|
||||||
|
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
|
||||||
|
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
|
||||||
|
# detection includes logical CPUs, thus allowing the use of SMT.
|
||||||
|
# Users may opt at runtime to use less than NUM_THREADS threads.
|
||||||
|
#
|
||||||
|
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
|
||||||
|
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
|
||||||
|
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
|
||||||
|
# footprint penalty, even if users reduce the actual number of threads at runtime.
|
||||||
# NUM_THREADS = 24
|
# NUM_THREADS = 24
|
||||||
|
|
||||||
# if you don't need to install the static library, please comment it in.
|
# If you have enabled USE_OPENMP and your application would call
|
||||||
|
# OpenBLAS's calculation API from multiple threads, please comment this in.
|
||||||
|
# This flag defines how many instances of OpenBLAS's calculation API can actually
|
||||||
|
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
|
||||||
|
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||||
|
# NUM_PARALLEL = 2
|
||||||
|
|
||||||
|
# When multithreading, OpenBLAS needs to use a memory buffer for communicating
|
||||||
|
# and collating results for individual subranges of the original matrix. Since
|
||||||
|
# the original GotoBLAS of the early 2000s, the default size of this buffer has
|
||||||
|
# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC.
|
||||||
|
# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment
|
||||||
|
# this line and adjust the (32<<n) factor if necessary. Usually an insufficient value
|
||||||
|
# manifests itself as a crash in the relevant scal kernel (sscal_k, dscal_k etc)
|
||||||
|
# BUFFERSIZE = 25
|
||||||
|
|
||||||
|
# If you don't need to install the static library, please comment this in.
|
||||||
# NO_STATIC = 1
|
# NO_STATIC = 1
|
||||||
|
|
||||||
# if you don't need generate the shared library, please comment it in.
|
# If you don't need to generate the shared library, please comment this in.
|
||||||
# NO_SHARED = 1
|
# NO_SHARED = 1
|
||||||
|
|
||||||
# If you don't need CBLAS interface, please comment it in.
|
# If you don't need the CBLAS interface, please comment this in.
|
||||||
# NO_CBLAS = 1
|
# NO_CBLAS = 1
|
||||||
|
|
||||||
# If you only want CBLAS interface without installing Fortran compiler,
|
# If you only want the CBLAS interface without installing a Fortran compiler,
|
||||||
# please comment it in.
|
# please comment this in.
|
||||||
# ONLY_CBLAS = 1
|
# ONLY_CBLAS = 1
|
||||||
|
|
||||||
# If you don't need LAPACK, please comment it in.
|
# If you don't need LAPACK, please comment this in.
|
||||||
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
|
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
|
||||||
# NO_LAPACK = 1
|
# NO_LAPACK = 1
|
||||||
|
|
||||||
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
|
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
|
||||||
# NO_LAPACKE = 1
|
# NO_LAPACKE = 1
|
||||||
|
|
||||||
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
# Build LAPACK Deprecated functions since LAPACK 3.6.0
|
||||||
|
@ -86,12 +132,18 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||||
# Build RecursiveLAPACK on top of LAPACK
|
# Build RecursiveLAPACK on top of LAPACK
|
||||||
# BUILD_RELAPACK = 1
|
# BUILD_RELAPACK = 1
|
||||||
|
|
||||||
# If you want to use legacy threaded Level 3 implementation.
|
# If you want to use the legacy threaded Level 3 implementation.
|
||||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||||
|
|
||||||
|
# If you want to use the new, still somewhat experimental code that uses
|
||||||
|
# thread-local storage instead of a central memory buffer in memory.c
|
||||||
|
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
|
||||||
|
# for this to work.
|
||||||
|
# USE_TLS = 1
|
||||||
|
|
||||||
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
# If you want to drive whole 64bit region by BLAS. Not all Fortran
|
||||||
# compiler supports this. It's safe to keep comment it out if you
|
# compilers support this. It's safe to keep this commented out if you
|
||||||
# are not sure(equivalent to "-i8" option).
|
# are not sure. (This is equivalent to the "-i8" ifort option).
|
||||||
# INTERFACE64 = 1
|
# INTERFACE64 = 1
|
||||||
|
|
||||||
# Unfortunately most of kernel won't give us high quality buffer.
|
# Unfortunately most of kernel won't give us high quality buffer.
|
||||||
|
@ -99,10 +151,18 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||||
# but it will consume time. If you don't like it, you can disable one.
|
# but it will consume time. If you don't like it, you can disable one.
|
||||||
NO_WARMUP = 1
|
NO_WARMUP = 1
|
||||||
|
|
||||||
# If you want to disable CPU/Memory affinity on Linux.
|
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
|
||||||
|
# This feature is only implemented on Linux, and is always disabled on other platforms.
|
||||||
|
# Enabling affinity handling may improve performance, especially on NUMA systems, but
|
||||||
|
# it may conflict with certain applications that also try to manage affinity.
|
||||||
|
# This conflict can result in threads of the application calling OpenBLAS ending up locked
|
||||||
|
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
|
||||||
|
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
|
||||||
|
# else modifies affinity settings.
|
||||||
|
# Note: enabling affinity has been known to cause problems with NumPy and R
|
||||||
NO_AFFINITY = 1
|
NO_AFFINITY = 1
|
||||||
|
|
||||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||||
# BIGNUMA = 1
|
# BIGNUMA = 1
|
||||||
|
|
||||||
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
|
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
|
||||||
|
@ -112,6 +172,10 @@ NO_AFFINITY = 1
|
||||||
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
|
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
|
||||||
# NO_AVX2 = 1
|
# NO_AVX2 = 1
|
||||||
|
|
||||||
|
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
|
||||||
|
# system will try to determine this automatically)
|
||||||
|
# NO_AVX512 = 1
|
||||||
|
|
||||||
# Don't use parallel make.
|
# Don't use parallel make.
|
||||||
# NO_PARALLEL_MAKE = 1
|
# NO_PARALLEL_MAKE = 1
|
||||||
|
|
||||||
|
@ -126,6 +190,9 @@ NO_AFFINITY = 1
|
||||||
# FUNCTION_PROFILE = 1
|
# FUNCTION_PROFILE = 1
|
||||||
|
|
||||||
# Support for IEEE quad precision(it's *real* REAL*16)( under testing)
|
# Support for IEEE quad precision(it's *real* REAL*16)( under testing)
|
||||||
|
# This option should not be used - it is a holdover from unfinished code present
|
||||||
|
# in the original GotoBLAS2 library that may be usable as a starting point but
|
||||||
|
# is not even expected to compile in its present form.
|
||||||
# QUAD_PRECISION = 1
|
# QUAD_PRECISION = 1
|
||||||
|
|
||||||
# Threads are still working for a while after finishing BLAS operation
|
# Threads are still working for a while after finishing BLAS operation
|
||||||
|
@ -133,22 +200,25 @@ NO_AFFINITY = 1
|
||||||
# time out to improve performance. This number should be from 4 to 30
|
# time out to improve performance. This number should be from 4 to 30
|
||||||
# which corresponds to (1 << n) cycles. For example, if you set to 26,
|
# which corresponds to (1 << n) cycles. For example, if you set to 26,
|
||||||
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
|
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
|
||||||
# system). Also you can control this mumber by THREAD_TIMEOUT
|
# system). Also you can control this number by THREAD_TIMEOUT
|
||||||
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
|
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26
|
||||||
|
|
||||||
# Using special device driver for mapping physically contigous memory
|
# Using special device driver for mapping physically contiguous memory
|
||||||
# to the user space. If bigphysarea is enabled, it will use it.
|
# to the user space. If bigphysarea is enabled, it will use it.
|
||||||
# DEVICEDRIVER_ALLOCATION = 1
|
# DEVICEDRIVER_ALLOCATION = 1
|
||||||
|
|
||||||
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
|
||||||
# CONSISTENT_FPCSR = 1
|
# CONSISTENT_FPCSR = 1
|
||||||
|
|
||||||
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
|
# If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
|
||||||
# with single thread. You can use this flag to avoid the overhead of multi-threading
|
# with single thread. (Actually in recent versions this is a factor proportional to the
|
||||||
# in small matrix sizes. The default value is 4.
|
# number of floating point operations necessary for the given problem size, no longer
|
||||||
|
# an individual dimension). You can use this setting to avoid the overhead of multi-
|
||||||
|
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
|
||||||
|
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
|
||||||
# GEMM_MULTITHREAD_THRESHOLD = 4
|
# GEMM_MULTITHREAD_THRESHOLD = 4
|
||||||
|
|
||||||
# If you need santy check by comparing reference BLAS. It'll be very
|
# If you need sanity check by comparing results to reference BLAS. It'll be very
|
||||||
# slow (Not implemented yet).
|
# slow (Not implemented yet).
|
||||||
# SANITY_CHECK = 1
|
# SANITY_CHECK = 1
|
||||||
|
|
||||||
|
@ -160,8 +230,8 @@ NO_AFFINITY = 1
|
||||||
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
|
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
|
||||||
# COMMON_OPT = -O2
|
# COMMON_OPT = -O2
|
||||||
|
|
||||||
# gfortran option for LAPACK
|
# gfortran option for LAPACK to improve thread-safety
|
||||||
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
|
# It is enabled by default in Makefile.system for gfortran
|
||||||
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
|
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
|
||||||
# FCOMMON_OPT = -frecursive
|
# FCOMMON_OPT = -frecursive
|
||||||
|
|
||||||
|
@ -188,6 +258,21 @@ COMMON_PROF = -pg
|
||||||
# SYMBOLPREFIX=
|
# SYMBOLPREFIX=
|
||||||
# SYMBOLSUFFIX=
|
# SYMBOLSUFFIX=
|
||||||
|
|
||||||
|
# Run a C++ based thread safety tester after the build is done.
|
||||||
|
# This is mostly intended as a developer feature to spot regressions, but users and
|
||||||
|
# package maintainers can enable this if they have doubts about the thread safety of
|
||||||
|
# the library, given the configuration in this file.
|
||||||
|
# By default, the thread safety tester launches 52 concurrent calculations at the same
|
||||||
|
# time.
|
||||||
|
#
|
||||||
|
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
|
||||||
|
#
|
||||||
|
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
|
||||||
|
# an OpenMP implementation. If you are cross-compiling this test will probably not
|
||||||
|
# work at all.
|
||||||
|
#
|
||||||
|
# CPP_THREAD_SAFETY_TEST = 1
|
||||||
|
|
||||||
#
|
#
|
||||||
# End of user configuration
|
# End of user configuration
|
||||||
#
|
#
|
||||||
|
|
182
Makefile.system
182
Makefile.system
|
@ -9,6 +9,26 @@ ifndef TOPDIR
|
||||||
TOPDIR = .
|
TOPDIR = .
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||||
|
ifndef ARCH
|
||||||
|
HOSTARCH := $(shell uname -m)
|
||||||
|
else
|
||||||
|
HOSTARCH = $(ARCH)
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Catch conflicting usage of ARCH in some BSD environments
|
||||||
|
ifeq ($(ARCH), amd64)
|
||||||
|
override ARCH=x86_64
|
||||||
|
else ifeq ($(ARCH), powerpc64)
|
||||||
|
override ARCH=power
|
||||||
|
else ifeq ($(ARCH), i386)
|
||||||
|
override ARCH=x86
|
||||||
|
else ifeq ($(ARCH), aarch64)
|
||||||
|
override ARCH=arm64
|
||||||
|
else ifeq ($(ARCH), zarch)
|
||||||
|
override ARCH=zarch
|
||||||
|
endif
|
||||||
|
|
||||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||||
|
|
||||||
# Default C compiler
|
# Default C compiler
|
||||||
|
@ -54,6 +74,7 @@ endif
|
||||||
|
|
||||||
ifdef TARGET
|
ifdef TARGET
|
||||||
GETARCH_FLAGS := -DFORCE_$(TARGET)
|
GETARCH_FLAGS := -DFORCE_$(TARGET)
|
||||||
|
GETARCH_FLAGS += -DUSER_TARGET
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# Force fallbacks for 32bit
|
# Force fallbacks for 32bit
|
||||||
|
@ -62,6 +83,9 @@ ifeq ($(BINARY), 32)
|
||||||
ifeq ($(TARGET), HASWELL)
|
ifeq ($(TARGET), HASWELL)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(TARGET), SKYLAKEX)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
ifeq ($(TARGET), SANDYBRIDGE)
|
ifeq ($(TARGET), SANDYBRIDGE)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
@ -80,6 +104,9 @@ endif
|
||||||
ifeq ($(TARGET), ZEN)
|
ifeq ($(TARGET), ZEN)
|
||||||
GETARCH_FLAGS := -DFORCE_BARCELONA
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(TARGET), ARMV8)
|
||||||
|
GETARCH_FLAGS := -DFORCE_ARMV7
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -95,6 +122,9 @@ ifeq ($(BINARY), 32)
|
||||||
ifeq ($(TARGET_CORE), HASWELL)
|
ifeq ($(TARGET_CORE), HASWELL)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
endif
|
endif
|
||||||
|
@ -116,7 +146,12 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||||
|
ifeq ($(HOSTARCH), x86_64)
|
||||||
|
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||||
|
GETARCH_FLAGS += -march=native
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
ifneq ($(INTERFACE64), 0)
|
ifneq ($(INTERFACE64), 0)
|
||||||
|
@ -134,13 +169,18 @@ GETARCH_FLAGS += -DNO_AVX
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(BINARY), 32)
|
ifeq ($(BINARY), 32)
|
||||||
GETARCH_FLAGS += -DNO_AVX
|
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
|
||||||
|
NO_AVX512 = 1
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(NO_AVX2), 1)
|
ifeq ($(NO_AVX2), 1)
|
||||||
GETARCH_FLAGS += -DNO_AVX2
|
GETARCH_FLAGS += -DNO_AVX2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(NO_AVX512), 1)
|
||||||
|
GETARCH_FLAGS += -DNO_AVX512
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(DEBUG), 1)
|
ifeq ($(DEBUG), 1)
|
||||||
GETARCH_FLAGS += -g
|
GETARCH_FLAGS += -g
|
||||||
endif
|
endif
|
||||||
|
@ -174,7 +214,7 @@ ifndef GOTOBLAS_MAKEFILE
|
||||||
export GOTOBLAS_MAKEFILE = 1
|
export GOTOBLAS_MAKEFILE = 1
|
||||||
|
|
||||||
# Generating Makefile.conf and config.h
|
# Generating Makefile.conf and config.h
|
||||||
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
|
||||||
|
|
||||||
ifndef TARGET_CORE
|
ifndef TARGET_CORE
|
||||||
include $(TOPDIR)/Makefile.conf
|
include $(TOPDIR)/Makefile.conf
|
||||||
|
@ -184,6 +224,10 @@ endif
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef NUM_PARALLEL
|
||||||
|
NUM_PARALLEL = 1
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef NUM_THREADS
|
ifndef NUM_THREADS
|
||||||
NUM_THREADS = $(NUM_CORES)
|
NUM_THREADS = $(NUM_CORES)
|
||||||
endif
|
endif
|
||||||
|
@ -207,6 +251,10 @@ SMP = 1
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(SMP), 1)
|
||||||
|
USE_LOCKING =
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef NEED_PIC
|
ifndef NEED_PIC
|
||||||
NEED_PIC = 1
|
NEED_PIC = 1
|
||||||
endif
|
endif
|
||||||
|
@ -223,9 +271,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
|
||||||
OBJCONV = $(CROSS_SUFFIX)objconv
|
OBJCONV = $(CROSS_SUFFIX)objconv
|
||||||
|
|
||||||
|
|
||||||
# For detect fortran failed, only build BLAS.
|
# When fortran support was either not detected or actively deselected, only build BLAS.
|
||||||
ifeq ($(NOFORTRAN), 1)
|
ifeq ($(NOFORTRAN), 1)
|
||||||
NO_LAPACK = 1
|
NO_LAPACK = 1
|
||||||
|
override FEXTRALIB =
|
||||||
endif
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
|
@ -234,7 +283,7 @@ endif
|
||||||
|
|
||||||
ifeq ($(OSNAME), Darwin)
|
ifeq ($(OSNAME), Darwin)
|
||||||
ifndef MACOSX_DEPLOYMENT_TARGET
|
ifndef MACOSX_DEPLOYMENT_TARGET
|
||||||
export MACOSX_DEPLOYMENT_TARGET=10.6
|
export MACOSX_DEPLOYMENT_TARGET=10.8
|
||||||
endif
|
endif
|
||||||
MD5SUM = md5 -r
|
MD5SUM = md5 -r
|
||||||
endif
|
endif
|
||||||
|
@ -275,12 +324,14 @@ CCOMMON_OPT += -DMS_ABI
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), GCC)
|
ifeq ($(C_COMPILER), GCC)
|
||||||
#Test for supporting MS_ABI
|
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||||
|
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||||
|
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||||
ifeq ($(GCCVERSIONGT4), 1)
|
ifeq ($(GCCVERSIONGT4), 1)
|
||||||
# GCC Majar version > 4
|
# GCC Major version > 4
|
||||||
# It is compatible with MSVC ABI.
|
# It is compatible with MSVC ABI.
|
||||||
CCOMMON_OPT += -DMS_ABI
|
CCOMMON_OPT += -DMS_ABI
|
||||||
endif
|
endif
|
||||||
|
@ -358,6 +409,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
|
||||||
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifdef USE_LOCKING
|
||||||
|
ifneq ($(USE_LOCKING), 0)
|
||||||
|
CCOMMON_OPT += -DUSE_LOCKING
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
#
|
#
|
||||||
# Architecture dependent settings
|
# Architecture dependent settings
|
||||||
#
|
#
|
||||||
|
@ -433,7 +490,7 @@ CCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), INTEL)
|
ifeq ($(C_COMPILER), INTEL)
|
||||||
CCOMMON_OPT += -openmp
|
CCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), PGI)
|
ifeq ($(C_COMPILER), PGI)
|
||||||
|
@ -458,13 +515,70 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
DYNAMIC_CORE = PRESCOTT CORE2
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
DYNAMIC_CORE += PENRYN DUNNINGTON
|
||||||
|
endif
|
||||||
|
DYNAMIC_CORE += NEHALEM
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
DYNAMIC_CORE += OPTERON OPTERON_SSE3
|
||||||
|
endif
|
||||||
|
DYNAMIC_CORE += BARCELONA
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
DYNAMIC_CORE += BOBCAT ATOM NANO
|
||||||
|
endif
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
|
||||||
endif
|
endif
|
||||||
ifneq ($(NO_AVX2), 1)
|
ifneq ($(NO_AVX2), 1)
|
||||||
DYNAMIC_CORE += HASWELL ZEN
|
DYNAMIC_CORE += HASWELL ZEN
|
||||||
endif
|
endif
|
||||||
|
ifneq ($(NO_AVX512), 1)
|
||||||
|
ifneq ($(NO_AVX2), 1)
|
||||||
|
DYNAMIC_CORE += SKYLAKEX
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef DYNAMIC_LIST
|
||||||
|
override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
|
||||||
|
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
|
||||||
|
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||||
|
CCOMMON_OPT += $(XCCOMMON_OPT)
|
||||||
|
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), arm64)
|
||||||
|
DYNAMIC_CORE = ARMV8
|
||||||
|
DYNAMIC_CORE += CORTEXA53
|
||||||
|
DYNAMIC_CORE += CORTEXA57
|
||||||
|
DYNAMIC_CORE += CORTEXA72
|
||||||
|
DYNAMIC_CORE += CORTEXA73
|
||||||
|
DYNAMIC_CORE += FALKOR
|
||||||
|
DYNAMIC_CORE += THUNDERX
|
||||||
|
DYNAMIC_CORE += THUNDERX2T99
|
||||||
|
DYNAMIC_CORE += TSV110
|
||||||
|
DYNAMIC_CORE += EMAG8180
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), zarch)
|
||||||
|
DYNAMIC_CORE = Z13
|
||||||
|
DYNAMIC_CORE += Z14
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), power)
|
||||||
|
DYNAMIC_CORE = POWER6
|
||||||
|
DYNAMIC_CORE += POWER8
|
||||||
|
ifneq ($(C_COMPILER), GCC)
|
||||||
|
DYNAMIC_CORE += POWER9
|
||||||
|
endif
|
||||||
|
ifeq ($(C_COMPILER), GCC)
|
||||||
|
ifeq ($(GCCVERSIONGT5), 1)
|
||||||
|
DYNAMIC_CORE += POWER9
|
||||||
|
else
|
||||||
|
# Tell the user why POWER9 is left out of the DYNAMIC_ARCH target list.
# (No comma after "info": with a comma, make does not recognise the function
# and silently expands the whole text as an undefined variable.)
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||||
|
@ -564,6 +678,11 @@ CCOMMON_OPT += -march=mips64
|
||||||
FCOMMON_OPT += -march=mips64
|
FCOMMON_OPT += -march=mips64
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), 1004K)
|
||||||
|
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||||
|
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(CORE), P5600)
|
ifeq ($(CORE), P5600)
|
||||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||||
|
@ -606,7 +725,7 @@ endif
|
||||||
|
|
||||||
ifeq ($(C_COMPILER), PGI)
|
ifeq ($(C_COMPILER), PGI)
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
CCOMMON_OPT += -tp p7-64
|
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||||
else
|
else
|
||||||
CCOMMON_OPT += -tp p7
|
CCOMMON_OPT += -tp p7
|
||||||
endif
|
endif
|
||||||
|
@ -666,12 +785,19 @@ else
|
||||||
FCOMMON_OPT += -m32
|
FCOMMON_OPT += -m32
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
ifneq ($(NO_LAPACKE), 1)
|
||||||
|
FCOMMON_OPT += -fno-second-underscore
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(F_COMPILER), GFORTRAN)
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||||
FCOMMON_OPT += -Wall
|
FCOMMON_OPT += -Wall
|
||||||
|
# make single-threaded LAPACK calls thread-safe #1847
|
||||||
|
FCOMMON_OPT += -frecursive
|
||||||
|
# work around ABI problem with passing single-character arguments
|
||||||
|
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||||
ifneq ($(NO_LAPACK), 1)
|
ifneq ($(NO_LAPACK), 1)
|
||||||
EXTRALIB += -lgfortran
|
EXTRALIB += -lgfortran
|
||||||
|
@ -717,7 +843,7 @@ FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
ifeq ($(USE_OPENMP), 1)
|
ifeq ($(USE_OPENMP), 1)
|
||||||
FCOMMON_OPT += -openmp
|
FCOMMON_OPT += -fopenmp
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
@ -897,6 +1023,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
CCOMMON_OPT += -DDYNAMIC_ARCH
|
CCOMMON_OPT += -DDYNAMIC_ARCH
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(DYNAMIC_OLDER), 1)
|
||||||
|
CCOMMON_OPT += -DDYNAMIC_OLDER
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(NO_LAPACK), 1)
|
ifeq ($(NO_LAPACK), 1)
|
||||||
CCOMMON_OPT += -DNO_LAPACK
|
CCOMMON_OPT += -DNO_LAPACK
|
||||||
#Disable LAPACK C interface
|
#Disable LAPACK C interface
|
||||||
|
@ -919,6 +1049,10 @@ ifeq ($(NO_AVX2), 1)
|
||||||
CCOMMON_OPT += -DNO_AVX2
|
CCOMMON_OPT += -DNO_AVX2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(NO_AVX512), 1)
|
||||||
|
CCOMMON_OPT += -DNO_AVX512
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef SMP
|
ifdef SMP
|
||||||
CCOMMON_OPT += -DSMP_SERVER
|
CCOMMON_OPT += -DSMP_SERVER
|
||||||
|
|
||||||
|
@ -965,10 +1099,18 @@ endif
|
||||||
|
|
||||||
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
||||||
|
|
||||||
|
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
|
||||||
|
|
||||||
ifdef USE_SIMPLE_THREADED_LEVEL3
|
ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(USE_TLS), 1)
|
||||||
|
CCOMMON_OPT += -DUSE_TLS
|
||||||
|
endif
|
||||||
|
|
||||||
|
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
|
||||||
|
|
||||||
ifndef SYMBOLPREFIX
|
ifndef SYMBOLPREFIX
|
||||||
SYMBOLPREFIX =
|
SYMBOLPREFIX =
|
||||||
endif
|
endif
|
||||||
|
@ -1016,8 +1158,12 @@ endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef NO_AFFINITY
|
ifdef NO_AFFINITY
|
||||||
|
ifeq ($(NO_AFFINITY), 0)
|
||||||
|
override undefine NO_AFFINITY
|
||||||
|
else
|
||||||
CCOMMON_OPT += -DNO_AFFINITY
|
CCOMMON_OPT += -DNO_AFFINITY
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef FUNCTION_PROFILE
|
ifdef FUNCTION_PROFILE
|
||||||
CCOMMON_OPT += -DFUNCTION_PROFILE
|
CCOMMON_OPT += -DFUNCTION_PROFILE
|
||||||
|
@ -1079,8 +1225,6 @@ ifndef FCOMMON_OPT
|
||||||
FCOMMON_OPT = -O2 -frecursive
|
FCOMMON_OPT = -O2 -frecursive
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
|
||||||
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
|
||||||
|
|
||||||
|
@ -1088,6 +1232,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
||||||
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
||||||
#MAKEOVERRIDES =
|
#MAKEOVERRIDES =
|
||||||
|
|
||||||
|
ifdef NEED_PIC
|
||||||
|
ifeq (,$(findstring PIC,$(FFLAGS)))
|
||||||
|
override FFLAGS += -fPIC
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
#For LAPACK Fortran codes.
|
#For LAPACK Fortran codes.
|
||||||
#Disable -fopenmp for LAPACK Fortran codes on Windows.
|
#Disable -fopenmp for LAPACK Fortran codes on Windows.
|
||||||
ifdef OS_WINDOWS
|
ifdef OS_WINDOWS
|
||||||
|
@ -1146,7 +1296,11 @@ endif
|
||||||
|
|
||||||
LIBDLLNAME = $(LIBPREFIX).dll
|
LIBDLLNAME = $(LIBPREFIX).dll
|
||||||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
||||||
|
ifneq ($(OSNAME), AIX)
|
||||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
||||||
|
else
|
||||||
|
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
||||||
|
endif
|
||||||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
||||||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
||||||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
||||||
|
@ -1179,6 +1333,7 @@ export OSNAME
|
||||||
export ARCH
|
export ARCH
|
||||||
export CORE
|
export CORE
|
||||||
export LIBCORE
|
export LIBCORE
|
||||||
|
export __BYTE_ORDER__
|
||||||
export PGCPATH
|
export PGCPATH
|
||||||
export CONFIG
|
export CONFIG
|
||||||
export CC
|
export CC
|
||||||
|
@ -1223,6 +1378,7 @@ export MSA_FLAGS
|
||||||
export KERNELDIR
|
export KERNELDIR
|
||||||
export FUNCTION_PROFILE
|
export FUNCTION_PROFILE
|
||||||
export TARGET_CORE
|
export TARGET_CORE
|
||||||
|
export NO_AVX512
|
||||||
|
|
||||||
export SGEMM_UNROLL_M
|
export SGEMM_UNROLL_M
|
||||||
export SGEMM_UNROLL_N
|
export SGEMM_UNROLL_N
|
||||||
|
|
|
@ -8,6 +8,38 @@ endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), SKYLAKEX)
|
||||||
|
ifndef DYNAMIC_ARCH
|
||||||
|
ifndef NO_AVX512
|
||||||
|
CCOMMON_OPT += -march=skylake-avx512
|
||||||
|
FCOMMON_OPT += -march=skylake-avx512
|
||||||
|
ifeq ($(OSNAME), CYGWIN_NT)
|
||||||
|
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||||
|
endif
|
||||||
|
ifeq ($(OSNAME), WINNT)
|
||||||
|
ifeq ($(C_COMPILER), GCC)
|
||||||
|
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), HASWELL)
|
||||||
|
ifndef DYNAMIC_ARCH
|
||||||
|
ifndef NO_AVX2
|
||||||
|
ifeq ($(C_COMPILER), GCC)
|
||||||
|
CCOMMON_OPT += -mavx2
|
||||||
|
endif
|
||||||
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
|
FCOMMON_OPT += -mavx2
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ifeq ($(OSNAME), Interix)
|
ifeq ($(OSNAME), Interix)
|
||||||
ARFLAGS = -m x64
|
ARFLAGS = -m x64
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
|
||||||
FCOMMON_OPT += -march=z13 -mzvector
|
FCOMMON_OPT += -march=z13 -mzvector
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(CORE), Z14)
|
||||||
|
CCOMMON_OPT += -march=z14 -mzvector
|
||||||
|
FCOMMON_OPT += -march=z14 -mzvector
|
||||||
|
endif
|
||||||
|
|
54
README.md
54
README.md
|
@ -6,11 +6,13 @@ Travis CI: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||||
|
|
||||||
|
[](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
|
|
||||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||||
|
|
||||||
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||||
|
|
||||||
## Binary Packages
|
## Binary Packages
|
||||||
|
|
||||||
|
@ -22,8 +24,10 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
||||||
|
|
||||||
## Installation from Source
|
## Installation from Source
|
||||||
|
|
||||||
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||||
|
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||||
|
Most can also be given directly on the make or cmake command line.
|
||||||
|
|
||||||
### Dependencies
|
### Dependencies
|
||||||
|
|
||||||
|
@ -63,9 +67,7 @@ A debug version can be built using `make DEBUG=1`.
|
||||||
|
|
||||||
### Compile with MASS support on Power CPU (optional)
|
### Compile with MASS support on Power CPU (optional)
|
||||||
|
|
||||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
|
||||||
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
|
||||||
are tuned for optimum performance on POWER architectures.
|
|
||||||
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||||
The library can be installed as shown:
|
The library can be installed as shown:
|
||||||
|
|
||||||
|
@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`.
|
||||||
|
|
||||||
## Supported CPUs and Operating Systems
|
## Supported CPUs and Operating Systems
|
||||||
|
|
||||||
Please read `GotoBLAS_01Readme.txt`.
|
Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
|
||||||
|
|
||||||
### Additional supported CPUs
|
### Additional supported CPUs
|
||||||
|
|
||||||
|
@ -110,10 +112,12 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||||
|
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||||
|
- **AMD ZEN**: Uses Haswell codes with some optimizations.
|
||||||
|
|
||||||
#### MIPS64
|
#### MIPS64
|
||||||
|
|
||||||
|
@ -127,26 +131,51 @@ Please read `GotoBLAS_01Readme.txt`.
|
||||||
|
|
||||||
#### ARM64
|
#### ARM64
|
||||||
|
|
||||||
- **ARMv8**: Experimental
|
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
|
||||||
- **ARM Cortex-A57**: Experimental
|
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
|
||||||
|
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||||
|
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||||
|
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||||
|
- **Falkor**: same as A57 (different cpu specifications)
|
||||||
|
- **ThunderX**: Optimized some Level-1 functions
|
||||||
|
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
||||||
|
- **TSV110**: Optimized some Level-3 helper functions
|
||||||
|
|
||||||
#### PPC/PPC64
|
#### PPC/PPC64
|
||||||
|
|
||||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
|
||||||
|
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
|
||||||
|
|
||||||
#### IBM zEnterprise System
|
#### IBM zEnterprise System
|
||||||
|
|
||||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
- **Z13**: Optimized Level-3 BLAS and Level-1,2
|
||||||
|
- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
|
||||||
|
|
||||||
|
### Support for multiple targets in a single library
|
||||||
|
|
||||||
|
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
|
||||||
|
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows specifying an individual list of targets to include instead of the default.
|
||||||
|
DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
|
||||||
|
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
|
||||||
|
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
|
||||||
|
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
|
||||||
|
The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
|
||||||
|
common code in the library; usually you will want to set this to the oldest model you expect to encounter.
|
||||||
|
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
|
||||||
|
|
||||||
### Supported OS
|
### Supported OS
|
||||||
|
|
||||||
- **GNU/Linux**
|
- **GNU/Linux**
|
||||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||||
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
|
||||||
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
|
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||||
|
- **AIX**: Supported on PPC up to POWER8
|
||||||
|
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||||
|
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
@ -200,7 +229,8 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
||||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||||
Clang 3.0 will generate the wrong AVX binary code.
|
Clang 3.0 will generate the wrong AVX binary code.
|
||||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
|
||||||
|
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||||
the library with `BIGNUMA=1`.
|
the library with `BIGNUMA=1`.
|
||||||
* OpenBLAS does not set processor affinity by default.
|
* OpenBLAS does not set processor affinity by default.
|
||||||
|
|
|
@ -20,6 +20,7 @@ DUNNINGTON
|
||||||
NEHALEM
|
NEHALEM
|
||||||
SANDYBRIDGE
|
SANDYBRIDGE
|
||||||
HASWELL
|
HASWELL
|
||||||
|
SKYLAKEX
|
||||||
ATOM
|
ATOM
|
||||||
|
|
||||||
b)AMD CPU:
|
b)AMD CPU:
|
||||||
|
@ -47,6 +48,7 @@ POWER5
|
||||||
POWER6
|
POWER6
|
||||||
POWER7
|
POWER7
|
||||||
POWER8
|
POWER8
|
||||||
|
POWER9
|
||||||
PPCG4
|
PPCG4
|
||||||
PPC970
|
PPC970
|
||||||
PPC970MP
|
PPC970MP
|
||||||
|
@ -56,6 +58,7 @@ CELL
|
||||||
|
|
||||||
3.MIPS CPU:
|
3.MIPS CPU:
|
||||||
P5600
|
P5600
|
||||||
|
1004K
|
||||||
|
|
||||||
4.MIPS64 CPU:
|
4.MIPS64 CPU:
|
||||||
SICORTEX
|
SICORTEX
|
||||||
|
@ -81,11 +84,16 @@ ARMV5
|
||||||
|
|
||||||
8.ARM 64-bit CPU:
|
8.ARM 64-bit CPU:
|
||||||
ARMV8
|
ARMV8
|
||||||
|
CORTEXA53
|
||||||
CORTEXA57
|
CORTEXA57
|
||||||
VULCAN
|
CORTEXA72
|
||||||
|
CORTEXA73
|
||||||
|
FALKOR
|
||||||
THUNDERX
|
THUNDERX
|
||||||
THUNDERX2T99
|
THUNDERX2T99
|
||||||
|
TSV110
|
||||||
|
|
||||||
9.System Z:
|
9.System Z:
|
||||||
ZARCH_GENERIC
|
ZARCH_GENERIC
|
||||||
Z13
|
Z13
|
||||||
|
Z14
|
||||||
|
|
20
appveyor.yml
20
appveyor.yml
|
@ -35,6 +35,14 @@ environment:
|
||||||
DYNAMIC_ARCH: ON
|
DYNAMIC_ARCH: ON
|
||||||
WITH_FORTRAN: no
|
WITH_FORTRAN: no
|
||||||
- COMPILER: cl
|
- COMPILER: cl
|
||||||
|
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||||
|
DYNAMIC_ARCH: OFF
|
||||||
|
WITH_FORTRAN: ignore
|
||||||
|
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||||
|
COMPILER: MinGW-gcc-6.3.0-32
|
||||||
|
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||||
|
COMPILER: MinGW-gcc-5.3.0
|
||||||
|
WITH_FORTRAN: ignore
|
||||||
|
|
||||||
install:
|
install:
|
||||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||||
|
@ -52,10 +60,17 @@ install:
|
||||||
before_build:
|
before_build:
|
||||||
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
||||||
- cd build
|
- cd build
|
||||||
|
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
|
||||||
|
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||||
|
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||||
|
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
|
||||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
|
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||||
|
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||||
|
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||||
|
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
|
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||||
|
|
||||||
build_script:
|
build_script:
|
||||||
- cmake --build .
|
- cmake --build .
|
||||||
|
@ -64,3 +79,4 @@ test_script:
|
||||||
- echo Running Test
|
- echo Running Test
|
||||||
- cd utest
|
- cd utest
|
||||||
- openblas_utest
|
- openblas_utest
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
trigger:
|
||||||
|
# start a new build for every push
|
||||||
|
batch: False
|
||||||
|
branches:
|
||||||
|
include:
|
||||||
|
- develop
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
# manylinux1 is useful to test because the
|
||||||
|
# standard Docker container uses an old version
|
||||||
|
# of gcc / glibc
|
||||||
|
- job: manylinux1_gcc
|
||||||
|
pool:
|
||||||
|
vmImage: 'ubuntu-16.04'
|
||||||
|
steps:
|
||||||
|
- script: |
|
||||||
|
# The Dockerfile text is produced by a double-quoted echo, so any unescaped
# $VAR would be expanded by this script (where it is unset) and vanish.
# \$ keeps the references literal so the shell inside the container expands
# them after COMMON_FLAGS and BTYPE have been assigned on the same RUN line.
echo "FROM quay.io/pypa/manylinux1_x86_64
|
||||||
|
COPY . /tmp/openblas
|
||||||
|
RUN cd /tmp/openblas && \
|
||||||
|
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
|
||||||
|
BTYPE='BINARY=64' CC=gcc && \
|
||||||
|
make QUIET_MAKE=1 \$COMMON_FLAGS \$BTYPE && \
|
||||||
|
make -C test \$COMMON_FLAGS \$BTYPE && \
|
||||||
|
make -C ctest \$COMMON_FLAGS \$BTYPE && \
|
||||||
|
make -C utest \$COMMON_FLAGS \$BTYPE" > Dockerfile
|
||||||
|
docker build .
|
||||||
|
displayName: Run manylinux1 docker build
|
||||||
|
- job: Intel_SDE_skx
|
||||||
|
pool:
|
||||||
|
vmImage: 'ubuntu-16.04'
|
||||||
|
steps:
|
||||||
|
- script: |
|
||||||
|
# at the time of writing the available Azure Ubuntu vm image
|
||||||
|
# does not support AVX512VL, so use more recent LTS version
|
||||||
|
echo "FROM ubuntu:bionic
|
||||||
|
COPY . /tmp/openblas
|
||||||
|
RUN apt-get -y update && apt-get -y install \\
|
||||||
|
cmake \\
|
||||||
|
gfortran \\
|
||||||
|
make \\
|
||||||
|
wget
|
||||||
|
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
|
||||||
|
mkdir sde-external-8.35.0-2019-03-11-lin && \\
|
||||||
|
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
|
||||||
|
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
|
||||||
|
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
|
||||||
|
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
|
||||||
|
docker build -t intel_sde .
|
||||||
|
# we need a privileged docker run for sde process attachment
|
||||||
|
docker run --privileged intel_sde
|
||||||
|
displayName: 'Run AVX512 SkylakeX docker build / test'
|
|
@ -129,7 +129,10 @@ int main(int argc, char *argv[]){
|
||||||
int step = 1;
|
int step = 1;
|
||||||
|
|
||||||
struct timeval start, stop;
|
struct timeval start, stop;
|
||||||
double time1,timeg;
|
double time1 = 0.0, timeg = 0.0;
|
||||||
|
long nanos = 0;
|
||||||
|
time_t seconds = 0;
|
||||||
|
struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
|
||||||
|
|
||||||
argc--;argv++;
|
argc--;argv++;
|
||||||
|
|
||||||
|
@ -163,11 +166,6 @@ int main(int argc, char *argv[]){
|
||||||
timeg=0;
|
timeg=0;
|
||||||
|
|
||||||
fprintf(stderr, " %6d : ", (int)m);
|
fprintf(stderr, " %6d : ", (int)m);
|
||||||
|
|
||||||
|
|
||||||
for (l=0; l<loops; l++)
|
|
||||||
{
|
|
||||||
|
|
||||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
|
@ -175,23 +173,25 @@ int main(int argc, char *argv[]){
|
||||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
gettimeofday( &start, (struct timezone *)0);
|
|
||||||
|
|
||||||
|
for (l=0; l<loops; l++)
|
||||||
|
{
|
||||||
|
clock_gettime(CLOCK_REALTIME, &time_start);
|
||||||
COPY (&m, x, &inc_x, y, &inc_y );
|
COPY (&m, x, &inc_x, y, &inc_y );
|
||||||
|
clock_gettime(CLOCK_REALTIME, &time_end);
|
||||||
|
|
||||||
gettimeofday( &stop, (struct timezone *)0);
|
nanos = time_end.tv_nsec - time_start.tv_nsec;
|
||||||
|
seconds = time_end.tv_sec - time_start.tv_sec;
|
||||||
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
|
||||||
|
|
||||||
|
time1 = seconds + nanos / 1.e9;
|
||||||
timeg += time1;
|
timeg += time1;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
timeg /= loops;
|
timeg /= loops;
|
||||||
|
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
" %10.2f MBytes %10.6f sec\n",
|
" %10.2f MBytes %12.9f sec\n",
|
||||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg / 1.e6, timeg);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
|
||||||
|
|
||||||
FLOAT *a, *x, *y;
|
FLOAT *a, *x, *y;
|
||||||
FLOAT alpha[] = {1.0, 1.0};
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
FLOAT beta [] = {1.0, 1.0};
|
FLOAT beta [] = {1.0, 0.0};
|
||||||
char trans='N';
|
char trans='N';
|
||||||
blasint m, i, j;
|
blasint m, i, j;
|
||||||
blasint inc_x=1,inc_y=1;
|
blasint inc_x=1,inc_y=1;
|
||||||
|
@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
|
||||||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||||
for(j = 0; j < m; j++){
|
for(j = 0; j < m; j++){
|
||||||
for(i = 0; i < n * COMPSIZE; i++){
|
for(i = 0; i < n * COMPSIZE; i++){
|
||||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
|
||||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
gettimeofday( &start, (struct timezone *)0);
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
|
||||||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||||
for(j = 0; j < m; j++){
|
for(j = 0; j < m; j++){
|
||||||
for(i = 0; i < n * COMPSIZE; i++){
|
for(i = 0; i < n * COMPSIZE; i++){
|
||||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
|
||||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
}
|
}
|
||||||
gettimeofday( &start, (struct timezone *)0);
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
|
|
||||||
argv <- commandArgs(trailingOnly = TRUE)
|
argv <- commandArgs(trailingOnly = TRUE)
|
||||||
|
|
||||||
|
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||||
|
|
||||||
nfrom <- 128
|
nfrom <- 128
|
||||||
nto <- 2048
|
nto <- 2048
|
||||||
nstep <- 128
|
nstep <- 128
|
||||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
||||||
loops <- as.numeric(argv[z])
|
loops <- as.numeric(argv[z])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||||
|
@ -27,29 +28,21 @@ if (p != "") {
|
||||||
loops <- as.numeric(p)
|
loops <- as.numeric(p)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||||
cat(sprintf(
|
|
||||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
|
||||||
nfrom,
|
|
||||||
nto,
|
|
||||||
nstep,
|
|
||||||
loops
|
|
||||||
))
|
|
||||||
cat(sprintf(" SIZE Flops Time\n"))
|
cat(sprintf(" SIZE Flops Time\n"))
|
||||||
|
|
||||||
n <- nfrom
|
n <- nfrom
|
||||||
while (n <= nto) {
|
while (n <= nto) {
|
||||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
A <- matrix(rnorm(n * n), nrow = n)
|
||||||
ev <- 0
|
ev <- 0
|
||||||
z <- system.time(for (l in 1:loops) {
|
z <- system.time(for (l in 1:loops) {
|
||||||
ev <- eigen(A)
|
ev <- eigen(A)
|
||||||
})
|
})
|
||||||
|
|
||||||
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
|
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
|
||||||
|
|
||||||
st <- sprintf("%.0fx%.0f :", n, n)
|
st <- sprintf("%.0fx%.0f :", n, n)
|
||||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||||
|
|
||||||
n <- n + nstep
|
n <- n + nstep
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
|
|
||||||
argv <- commandArgs(trailingOnly = TRUE)
|
argv <- commandArgs(trailingOnly = TRUE)
|
||||||
|
|
||||||
|
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||||
|
|
||||||
nfrom <- 128
|
nfrom <- 128
|
||||||
nto <- 2048
|
nto <- 2048
|
||||||
nstep <- 128
|
nstep <- 128
|
||||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
||||||
loops <- as.numeric(argv[z])
|
loops <- as.numeric(argv[z])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||||
|
@ -27,26 +28,13 @@ if (p != "") {
|
||||||
loops <- as.numeric(p)
|
loops <- as.numeric(p)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||||
cat(sprintf(
|
|
||||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
|
||||||
nfrom,
|
|
||||||
nto,
|
|
||||||
nstep,
|
|
||||||
loops
|
|
||||||
))
|
|
||||||
cat(sprintf(" SIZE Flops Time\n"))
|
cat(sprintf(" SIZE Flops Time\n"))
|
||||||
|
|
||||||
n <- nfrom
|
n <- nfrom
|
||||||
while (n <= nto) {
|
while (n <= nto) {
|
||||||
A <- matrix(runif(n * n),
|
A <- matrix(runif(n * n), nrow = n)
|
||||||
ncol = n,
|
B <- matrix(runif(n * n), nrow = n)
|
||||||
nrow = n,
|
|
||||||
byrow = TRUE)
|
|
||||||
B <- matrix(runif(n * n),
|
|
||||||
ncol = n,
|
|
||||||
nrow = n,
|
|
||||||
byrow = TRUE)
|
|
||||||
C <- 1
|
C <- 1
|
||||||
|
|
||||||
z <- system.time(for (l in 1:loops) {
|
z <- system.time(for (l in 1:loops) {
|
||||||
|
@ -54,11 +42,10 @@ while (n <= nto) {
|
||||||
l <- l + 1
|
l <- l + 1
|
||||||
})
|
})
|
||||||
|
|
||||||
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
|
||||||
|
|
||||||
st <- sprintf("%.0fx%.0f :", n, n)
|
st <- sprintf("%.0fx%.0f :", n, n)
|
||||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||||
|
|
||||||
n <- n + nstep
|
n <- n + nstep
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,8 @@
|
||||||
|
|
||||||
argv <- commandArgs(trailingOnly = TRUE)
|
argv <- commandArgs(trailingOnly = TRUE)
|
||||||
|
|
||||||
|
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
|
||||||
|
|
||||||
nfrom <- 128
|
nfrom <- 128
|
||||||
nto <- 2048
|
nto <- 2048
|
||||||
nstep <- 128
|
nstep <- 128
|
||||||
|
@ -19,7 +21,6 @@ if (length(argv) > 0) {
|
||||||
loops <- as.numeric(argv[z])
|
loops <- as.numeric(argv[z])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
p <- Sys.getenv("OPENBLAS_LOOPS")
|
p <- Sys.getenv("OPENBLAS_LOOPS")
|
||||||
|
@ -27,31 +28,22 @@ if (p != "") {
|
||||||
loops <- as.numeric(p)
|
loops <- as.numeric(p)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
|
||||||
cat(sprintf(
|
|
||||||
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
|
|
||||||
nfrom,
|
|
||||||
nto,
|
|
||||||
nstep,
|
|
||||||
loops
|
|
||||||
))
|
|
||||||
cat(sprintf(" SIZE Flops Time\n"))
|
cat(sprintf(" SIZE Flops Time\n"))
|
||||||
|
|
||||||
n <- nfrom
|
n <- nfrom
|
||||||
while (n <= nto) {
|
while (n <= nto) {
|
||||||
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
A <- matrix(rnorm(n * n), nrow = n)
|
||||||
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
|
B <- matrix(rnorm(n * n), nrow = n)
|
||||||
|
|
||||||
z <- system.time(for (l in 1:loops) {
|
z <- system.time(for (l in 1:loops) {
|
||||||
solve(A, B)
|
solve(A, B)
|
||||||
})
|
})
|
||||||
|
|
||||||
mflops <-
|
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
|
||||||
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
|
|
||||||
|
|
||||||
st <- sprintf("%.0fx%.0f :", n, n)
|
st <- sprintf("%.0fx%.0f :", n, n)
|
||||||
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
|
||||||
|
|
||||||
n <- n + nstep
|
n <- n + nstep
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
72
c_check
72
c_check
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/perl
|
#!/usr/bin/perl
|
||||||
|
|
||||||
use File::Basename;
|
#use File::Basename;
|
||||||
use File::Temp qw(tempfile);
|
# use File::Temp qw(tempfile);
|
||||||
|
|
||||||
# Checking cross compile
|
# Checking cross compile
|
||||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||||
|
@ -12,17 +12,18 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||||
|
|
||||||
$tmpf = new File::Temp( UNLINK => 1 );
|
#$tmpf = new File::Temp( UNLINK => 1 );
|
||||||
$binary = $ENV{"BINARY"};
|
$binary = $ENV{"BINARY"};
|
||||||
|
|
||||||
$makefile = shift(@ARGV);
|
$makefile = shift(@ARGV);
|
||||||
$config = shift(@ARGV);
|
$config = shift(@ARGV);
|
||||||
|
|
||||||
$compiler_name = join(" ", @ARGV);
|
$compiler_name = shift(@ARGV);
|
||||||
|
$flags = join(" ", @ARGV);
|
||||||
|
|
||||||
# First, we need to know the target OS and compiler name
|
# First, we need to know the target OS and compiler name
|
||||||
|
|
||||||
$data = `$compiler_name -E ctest.c`;
|
$data = `$compiler_name $flags -E ctest.c`;
|
||||||
|
|
||||||
if ($?) {
|
if ($?) {
|
||||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||||
|
@ -31,6 +32,18 @@ if ($?) {
|
||||||
|
|
||||||
$cross_suffix = "";
|
$cross_suffix = "";
|
||||||
|
|
||||||
|
eval "use File::Basename";
|
||||||
|
if ($@){
|
||||||
|
warn "could not load PERL module File::Basename, emulating its functionality";
|
||||||
|
# Emulate dirname(): keep everything before the last "/" (slash excluded),
# or "." when the compiler name carries no directory component.
my $last_slash = rindex($compiler_name, "/");
my $dirnam = ($last_slash < 0) ? "." : substr($compiler_name, 0, $last_slash);
|
||||||
|
if ($dirnam ne ".") {
|
||||||
|
$cross_suffix .= $dirnam . "/";
|
||||||
|
}
|
||||||
|
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
|
||||||
|
if ($basnam =~ /([^\s]*-)(.*)/) {
|
||||||
|
$cross_suffix .= $1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
if (dirname($compiler_name) ne ".") {
|
if (dirname($compiler_name) ne ".") {
|
||||||
$cross_suffix .= dirname($compiler_name) . "/";
|
$cross_suffix .= dirname($compiler_name) . "/";
|
||||||
}
|
}
|
||||||
|
@ -38,6 +51,7 @@ if (dirname($compiler_name) ne ".") {
|
||||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||||
$cross_suffix .= $1;
|
$cross_suffix .= $1;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$compiler = "";
|
$compiler = "";
|
||||||
$compiler = LSB if ($data =~ /COMPILER_LSB/);
|
$compiler = LSB if ($data =~ /COMPILER_LSB/);
|
||||||
|
@ -64,6 +78,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
|
||||||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||||
$os = Android if ($data =~ /OS_ANDROID/);
|
$os = Android if ($data =~ /OS_ANDROID/);
|
||||||
|
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||||
|
|
||||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||||
|
@ -167,7 +182,7 @@ if ($defined == 0) {
|
||||||
|
|
||||||
# Do again
|
# Do again
|
||||||
|
|
||||||
$data = `$compiler_name -E ctest.c`;
|
$data = `$compiler_name $flags -E ctest.c`;
|
||||||
|
|
||||||
if ($?) {
|
if ($?) {
|
||||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||||
|
@ -176,13 +191,18 @@ if ($?) {
|
||||||
|
|
||||||
$have_msa = 0;
|
$have_msa = 0;
|
||||||
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||||
|
eval "use File::Temp qw(tempfile)";
|
||||||
|
if ($@){
|
||||||
|
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||||
|
} else {
|
||||||
|
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||||
$code = '"addvi.b $w0, $w1, 1"';
|
$code = '"addvi.b $w0, $w1, 1"';
|
||||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||||
print $tmpf "#include <msa.h>\n\n";
|
print $tmpf "#include <msa.h>\n\n";
|
||||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||||
|
|
||||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
$args = "$msa_flags -o $tmpf.o $tmpf";
|
||||||
my @cmd = ("$compiler_name $args");
|
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||||
system(@cmd) == 0;
|
system(@cmd) == 0;
|
||||||
if ($? != 0) {
|
if ($? != 0) {
|
||||||
$have_msa = 0;
|
$have_msa = 0;
|
||||||
|
@ -191,6 +211,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||||
}
|
}
|
||||||
unlink("$tmpf.o");
|
unlink("$tmpf.o");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||||
|
@ -207,14 +228,39 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||||
$binformat = bin32;
|
$binformat = bin32;
|
||||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||||
|
|
||||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
$no_avx512= 0;
|
||||||
|
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||||
|
eval "use File::Temp qw(tempfile)";
|
||||||
|
if ($@){
|
||||||
|
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
|
||||||
|
$no_avx512 = 0;
|
||||||
|
} else {
|
||||||
|
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||||
|
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||||
|
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||||
|
# tempfile() returned ($fh, $tmpf): $fh is the open handle, $tmpf only the file name.
print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
# Close so the buffered source is on disk before the compiler reads it;
# UNLINK => 1 still removes the file at program exit.
close($fh);
|
||||||
|
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
|
||||||
|
if ($compiler eq "PGI") {
|
||||||
|
$args = " -tp skylake -c -o $tmpf.o $tmpf";
|
||||||
|
}
|
||||||
|
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||||
|
system(@cmd) == 0;
|
||||||
|
if ($? != 0) {
|
||||||
|
$no_avx512 = 1;
|
||||||
|
} else {
|
||||||
|
$no_avx512 = 0;
|
||||||
|
}
|
||||||
|
unlink("$tmpf.o");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||||
|
|
||||||
$data =~ /globl\s([_\.]*)(.*)/;
|
$data =~ /globl\s([_\.]*)(.*)/;
|
||||||
|
|
||||||
$need_fu = $1;
|
$need_fu = $1;
|
||||||
|
|
||||||
$cross = 0;
|
$cross = 0;
|
||||||
$cross = 1 if ($os ne $hostos);
|
|
||||||
|
|
||||||
if ($architecture ne $hostarch) {
|
if ($architecture ne $hostarch) {
|
||||||
$cross = 1;
|
$cross = 1;
|
||||||
|
@ -222,6 +268,8 @@ if ($architecture ne $hostarch) {
|
||||||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
$cross = 1 if ($os ne $hostos);
|
||||||
|
|
||||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||||
|
|
||||||
$linker_L = "";
|
$linker_L = "";
|
||||||
|
@ -229,7 +277,7 @@ $linker_l = "";
|
||||||
$linker_a = "";
|
$linker_a = "";
|
||||||
|
|
||||||
{
|
{
|
||||||
$link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
|
$link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
|
||||||
|
|
||||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||||
|
|
||||||
|
@ -267,6 +315,7 @@ $linker_a = "";
|
||||||
&& ($flags !~ /kernel32/)
|
&& ($flags !~ /kernel32/)
|
||||||
&& ($flags !~ /advapi32/)
|
&& ($flags !~ /advapi32/)
|
||||||
&& ($flags !~ /shell32/)
|
&& ($flags !~ /shell32/)
|
||||||
|
&& ($flags !~ /omp/)
|
||||||
) {
|
) {
|
||||||
$linker_l .= $flags . " "
|
$linker_l .= $flags . " "
|
||||||
}
|
}
|
||||||
|
@ -294,6 +343,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||||
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
||||||
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
||||||
|
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||||
|
|
||||||
$os =~ tr/[a-z]/[A-Z]/;
|
$os =~ tr/[a-z]/[A-Z]/;
|
||||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||||
|
|
21
cblas.h
21
cblas.h
|
@ -51,6 +51,7 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
|
||||||
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
|
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
|
||||||
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
|
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
|
||||||
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
|
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
|
||||||
|
typedef CBLAS_ORDER CBLAS_LAYOUT;
|
||||||
|
|
||||||
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||||
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
|
||||||
|
@ -72,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
|
||||||
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
|
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
|
||||||
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
|
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
|
||||||
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
|
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
|
||||||
|
@ -82,6 +88,21 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
|
||||||
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
|
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
|
||||||
|
|
||||||
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
|
||||||
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
|
||||||
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
|
||||||
|
|
|
@ -0,0 +1,79 @@
|
||||||
|
# OpenBLASConfig.cmake
|
||||||
|
# --------------------
|
||||||
|
#
|
||||||
|
# OpenBLAS cmake module.
|
||||||
|
# This module sets the following variables in your project::
|
||||||
|
#
|
||||||
|
# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
|
||||||
|
# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
|
||||||
|
# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located.
|
||||||
|
# OpenBLAS_INCLUDE_DIR - same as DIRS
|
||||||
|
# OpenBLAS_LIBRARIES - OpenBLAS library to link against.
|
||||||
|
# OpenBLAS_LIBRARY - same as LIBRARIES
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Available components::
|
||||||
|
#
|
||||||
|
## shared - search for only shared library
|
||||||
|
## static - search for only static library
|
||||||
|
# serial - search for unthreaded library
|
||||||
|
# pthread - search for native pthread threaded library
|
||||||
|
# openmp - search for OpenMP threaded library
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Exported targets::
|
||||||
|
#
|
||||||
|
# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
|
||||||
|
## target. Target is shared _or_ static, so, for both, use separate, not
|
||||||
|
## overlapping, installations. ::
|
||||||
|
#
|
||||||
|
# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# Suggested usage::
|
||||||
|
#
|
||||||
|
# find_package(OpenBLAS)
|
||||||
|
# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
|
||||||
|
#
|
||||||
|
#
|
||||||
|
# The following variables can be set to guide the search for this package::
|
||||||
|
#
|
||||||
|
# OpenBLAS_DIR - CMake variable, set to directory containing this Config file
|
||||||
|
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
|
||||||
|
# PATH - environment variable, set to bin directory of this package
|
||||||
|
# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
|
||||||
|
# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
|
||||||
|
|
||||||
|
@PACKAGE_INIT@
|
||||||
|
|
||||||
|
set(PN OpenBLAS)
|
||||||
|
|
||||||
|
# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
|
||||||
|
if(@USE_OPENMP@)
|
||||||
|
set(${PN}_openmp_FOUND 1)
|
||||||
|
elseif(@USE_THREAD@)
|
||||||
|
set(${PN}_pthread_FOUND 1)
|
||||||
|
else()
|
||||||
|
set(${PN}_serial_FOUND 1)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
check_required_components(${PN})
|
||||||
|
|
||||||
|
#-----------------------------------------------------------------------------
|
||||||
|
# Don't include targets if this file is being picked up by another
|
||||||
|
# project which has already built this as a subproject
|
||||||
|
#-----------------------------------------------------------------------------
|
||||||
|
if(NOT TARGET ${PN}::OpenBLAS)
|
||||||
|
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
|
||||||
|
|
||||||
|
get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
|
||||||
|
set(${PN}_LIBRARY ${_loc})
|
||||||
|
get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
|
||||||
|
set(${PN}_LIBRARIES ${_ill})
|
||||||
|
|
||||||
|
get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES)
|
||||||
|
set(${PN}_INCLUDE_DIR ${_id})
|
||||||
|
get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
|
||||||
|
set(${PN}_INCLUDE_DIRS ${_iid})
|
||||||
|
endif()
|
||||||
|
|
|
@ -44,22 +44,49 @@ endif ()
|
||||||
|
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
|
if (ARM64)
|
||||||
|
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (POWER)
|
||||||
|
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (X86)
|
if (X86)
|
||||||
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
|
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (X86_64)
|
if (X86_64)
|
||||||
set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
|
set(DYNAMIC_CORE PRESCOTT CORE2)
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
|
||||||
|
endif ()
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
|
||||||
|
endif ()
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA)
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
|
||||||
|
endif ()
|
||||||
if (NOT NO_AVX)
|
if (NOT NO_AVX)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
|
||||||
endif ()
|
endif ()
|
||||||
if (NOT NO_AVX2)
|
if (NOT NO_AVX2)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||||
endif ()
|
endif ()
|
||||||
|
if (NOT NO_AVX512)
|
||||||
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||||
|
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
|
endif ()
|
||||||
|
if (DYNAMIC_LIST)
|
||||||
|
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NOT DYNAMIC_CORE)
|
if (NOT DYNAMIC_CORE)
|
||||||
unset(DYNAMIC_ARCH)
|
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
|
||||||
|
unset(DYNAMIC_ARCH CACHE)
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||||
## Sets C related variables.
|
## Sets C related variables.
|
||||||
|
|
||||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||||
|
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
||||||
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
||||||
|
@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
|
||||||
else ()
|
else ()
|
||||||
|
@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||||
else ()
|
else ()
|
||||||
|
@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||||
|
|
||||||
if (MIPS64)
|
if (MIPS64)
|
||||||
|
|
||||||
|
@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
|
||||||
if (X86)
|
if (X86)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
|
||||||
|
@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||||
endif ()
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (${CORE} STREQUAL "SKYLAKEX")
|
||||||
|
if (NOT DYNAMIC_ARCH)
|
||||||
|
if (NOT NO_AVX512)
|
||||||
|
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") # no '=' in set(): it would be stored as a list element
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
|
@ -3,6 +3,11 @@
|
||||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||||
## Sets Fortran related variables.
|
## Sets Fortran related variables.
|
||||||
|
|
||||||
|
if (INTERFACE64)
|
||||||
|
set(SUFFIX64 64)
|
||||||
|
set(SUFFIX64_UNDERSCORE _64)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (${F_COMPILER} STREQUAL "FLANG")
|
if (${F_COMPILER} STREQUAL "FLANG")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||||
if (BINARY64 AND INTERFACE64)
|
if (BINARY64 AND INTERFACE64)
|
||||||
|
@ -39,7 +44,10 @@ endif ()
|
||||||
|
|
||||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
|
# ensure reentrancy of lapack codes
|
||||||
|
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||||
|
# work around ABI violation in passing string arguments from C
|
||||||
|
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||||
if (NOT NO_LAPACK)
|
if (NOT NO_LAPACK)
|
||||||
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
# helper functions for the kernel CMakeLists.txt
|
# helper functions for the kernel CMakeLists.txt
|
||||||
|
|
||||||
|
|
||||||
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file.
|
# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
|
||||||
macro(SetDefaultL1)
|
macro(SetDefaultL1)
|
||||||
set(SAMAXKERNEL amax.S)
|
set(SAMAXKERNEL amax.S)
|
||||||
set(DAMAXKERNEL amax.S)
|
set(DAMAXKERNEL amax.S)
|
||||||
|
@ -107,6 +107,12 @@ macro(SetDefaultL1)
|
||||||
set(DAXPBYKERNEL ../arm/axpby.c)
|
set(DAXPBYKERNEL ../arm/axpby.c)
|
||||||
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
set(CAXPBYKERNEL ../arm/zaxpby.c)
|
||||||
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
set(ZAXPBYKERNEL ../arm/zaxpby.c)
|
||||||
|
set(SSUMKERNEL sum.S)
|
||||||
|
set(DSUMKERNEL sum.S)
|
||||||
|
set(CSUMKERNEL zsum.S)
|
||||||
|
set(ZSUMKERNEL zsum.S)
|
||||||
|
set(QSUMKERNEL sum.S)
|
||||||
|
set(XSUMKERNEL zsum.S)
|
||||||
endmacro ()
|
endmacro ()
|
||||||
|
|
||||||
macro(SetDefaultL2)
|
macro(SetDefaultL2)
|
||||||
|
|
|
@ -115,7 +115,9 @@ set(SLASRC
|
||||||
stplqt.f stplqt2.f stpmlqt.f
|
stplqt.f stplqt2.f stpmlqt.f
|
||||||
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
||||||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
|
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||||
|
scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
|
||||||
|
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
|
||||||
|
|
||||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||||
|
@ -210,7 +212,9 @@ set(CLASRC
|
||||||
ctplqt.f ctplqt2.f ctpmlqt.f
|
ctplqt.f ctplqt2.f ctpmlqt.f
|
||||||
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
||||||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
|
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||||
|
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||||
|
cungtsqr.f cunhr_col.f )
|
||||||
|
|
||||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||||
|
@ -299,7 +303,9 @@ set(DLASRC
|
||||||
dtplqt.f dtplqt2.f dtpmlqt.f
|
dtplqt.f dtplqt2.f dtpmlqt.f
|
||||||
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
||||||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
|
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||||
|
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||||
|
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
|
||||||
|
|
||||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||||
|
@ -398,7 +404,9 @@ set(ZLASRC
|
||||||
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
||||||
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
||||||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
|
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||||
|
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||||
|
zungtsqr.f zunhr_col.f)
|
||||||
|
|
||||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||||
|
|
|
@ -715,6 +715,8 @@ set(DSRC
|
||||||
lapacke_dgesv_work.c
|
lapacke_dgesv_work.c
|
||||||
lapacke_dgesvd.c
|
lapacke_dgesvd.c
|
||||||
lapacke_dgesvd_work.c
|
lapacke_dgesvd_work.c
|
||||||
|
lapacke_dgesvdq.c
|
||||||
|
lapacke_dgesvdq_work.c
|
||||||
lapacke_dgesvdx.c
|
lapacke_dgesvdx.c
|
||||||
lapacke_dgesvdx_work.c
|
lapacke_dgesvdx_work.c
|
||||||
lapacke_dgesvj.c
|
lapacke_dgesvj.c
|
||||||
|
@ -1287,6 +1289,8 @@ set(SSRC
|
||||||
lapacke_sgesv_work.c
|
lapacke_sgesv_work.c
|
||||||
lapacke_sgesvd.c
|
lapacke_sgesvd.c
|
||||||
lapacke_sgesvd_work.c
|
lapacke_sgesvd_work.c
|
||||||
|
lapacke_sgesvdq.c
|
||||||
|
lapacke_sgesvdq_work.c
|
||||||
lapacke_sgesvdx.c
|
lapacke_sgesvdx.c
|
||||||
lapacke_sgesvdx_work.c
|
lapacke_sgesvdx_work.c
|
||||||
lapacke_sgesvj.c
|
lapacke_sgesvj.c
|
||||||
|
@ -1853,6 +1857,8 @@ set(ZSRC
|
||||||
lapacke_zgesv_work.c
|
lapacke_zgesv_work.c
|
||||||
lapacke_zgesvd.c
|
lapacke_zgesvd.c
|
||||||
lapacke_zgesvd_work.c
|
lapacke_zgesvd_work.c
|
||||||
|
lapacke_zgesvdq.c
|
||||||
|
lapacke_zgesvdq_work.c
|
||||||
lapacke_zgesvdx.c
|
lapacke_zgesvdx.c
|
||||||
lapacke_zgesvdx_work.c
|
lapacke_zgesvdx_work.c
|
||||||
lapacke_zgesvj.c
|
lapacke_zgesvj.c
|
||||||
|
|
|
@ -1,9 +1,11 @@
|
||||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||||
|
libsuffix=@SUFFIX64_UNDERSCORE@
|
||||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||||
|
|
||||||
|
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||||
Name: OpenBLAS
|
Name: OpenBLAS
|
||||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||||
Version: @OPENBLAS_VERSION@
|
Version: @OPENBLAS_VERSION@
|
||||||
URL: https://github.com/xianyi/OpenBLAS
|
URL: https://github.com/xianyi/OpenBLAS
|
||||||
Libs: -L${libdir} -lopenblas
|
Libs: -L${libdir} -lopenblas${libsuffix}
|
||||||
Cflags: -I${includedir}
|
Cflags: -I${includedir}
|
||||||
|
|
|
@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
|
||||||
set(NO_EXPRECISION 1)
|
set(NO_EXPRECISION 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
|
||||||
|
set(EXTRALIB "${EXTRALIB} -lm")
|
||||||
|
set(NO_EXPRECISION 1)
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
|
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
|
||||||
set(EXTRALIB "${EXTRALIB} -lm")
|
set(EXTRALIB "${EXTRALIB} -lm")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -59,6 +59,9 @@ set(FU "")
|
||||||
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
|
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
|
||||||
set(FU "_")
|
set(FU "_")
|
||||||
endif()
|
endif()
|
||||||
|
if(MINGW AND NOT MINGW64)
|
||||||
|
set(FU "_")
|
||||||
|
endif()
|
||||||
|
|
||||||
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
|
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
|
||||||
if (${COMPILER_ID} STREQUAL "GNU")
|
if (${COMPILER_ID} STREQUAL "GNU")
|
||||||
|
@ -82,18 +85,59 @@ endif ()
|
||||||
# f_check
|
# f_check
|
||||||
if (NOT NOFORTRAN)
|
if (NOT NOFORTRAN)
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
||||||
|
else ()
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define BUNDERSCORE _\n"
|
||||||
|
"#define NEEDBUNDERSCORE 1\n")
|
||||||
|
set(BU "_")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
# Cannot run getarch on target if we are cross-compiling
|
# Cannot run getarch on target if we are cross-compiling
|
||||||
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
|
||||||
# Write to config as getarch would
|
# Write to config as getarch would
|
||||||
|
if (DEFINED TARGET_CORE)
|
||||||
|
set(TCORE ${TARGET_CORE})
|
||||||
|
else()
|
||||||
|
set(TCORE ${CORE})
|
||||||
|
endif()
|
||||||
|
|
||||||
# TODO: Set up defines that getarch sets up based on every other target
|
# TODO: Set up defines that getarch sets up based on every other target
|
||||||
# Perhaps this should be inside a different file as it grows larger
|
# Perhaps this should be inside a different file as it grows larger
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define ${CORE}\n"
|
"#define ${TCORE}\n"
|
||||||
"#define CHAR_CORENAME \"${CORE}\"\n")
|
"#define CORE_${TCORE}\n"
|
||||||
if ("${CORE}" STREQUAL "ARMV7")
|
"#define CHAR_CORENAME \"${TCORE}\"\n")
|
||||||
|
if ("${TCORE}" STREQUAL "CORE2")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L2_SIZE\t1048576\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t256\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define HAVE_CMOV\n"
|
||||||
|
"#define HAVE_MMX\n"
|
||||||
|
"#define HAVE_SSE\n"
|
||||||
|
"#define HAVE_SSE2\n"
|
||||||
|
"#define HAVE_SSE3\n"
|
||||||
|
"#define HAVE_SSSE3\n"
|
||||||
|
"#define SLOCAL_BUFFER_SIZE\t16384\n"
|
||||||
|
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||||
|
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||||
|
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||||
|
set(SGEMM_UNROLL_M 8)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 4)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 4)
|
||||||
|
set(CGEMM_UNROLL_N 2)
|
||||||
|
set(ZGEMM_UNROLL_M 2)
|
||||||
|
set(ZGEMM_UNROLL_N 2)
|
||||||
|
set(CGEMM3M_UNROLL_M 8)
|
||||||
|
set(CGEMM3M_UNROLL_N 4)
|
||||||
|
set(ZGEMM3M_UNROLL_M 4)
|
||||||
|
set(ZGEMM3M_UNROLL_N 4)
|
||||||
|
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define L1_DATA_SIZE\t65536\n"
|
"#define L1_DATA_SIZE\t65536\n"
|
||||||
"#define L1_DATA_LINESIZE\t32\n"
|
"#define L1_DATA_LINESIZE\t32\n"
|
||||||
|
@ -108,7 +152,11 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
||||||
set(SGEMM_UNROLL_N 4)
|
set(SGEMM_UNROLL_N 4)
|
||||||
set(DGEMM_UNROLL_M 4)
|
set(DGEMM_UNROLL_M 4)
|
||||||
set(DGEMM_UNROLL_N 4)
|
set(DGEMM_UNROLL_N 4)
|
||||||
elseif ("${CORE}" STREQUAL "ARMV8")
|
set(CGEMM_UNROLL_M 2)
|
||||||
|
set(CGEMM_UNROLL_N 2)
|
||||||
|
set(ZGEMM_UNROLL_M 2)
|
||||||
|
set(ZGEMM_UNROLL_N 2)
|
||||||
|
elseif ("${TCORE}" STREQUAL "ARMV8")
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define L1_DATA_SIZE\t32768\n"
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
"#define L1_DATA_LINESIZE\t64\n"
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
@ -116,18 +164,26 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
||||||
"#define L2_LINESIZE\t64\n"
|
"#define L2_LINESIZE\t64\n"
|
||||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
"#define DTB_SIZE\t4096\n"
|
"#define DTB_SIZE\t4096\n"
|
||||||
"#define L2_ASSOCIATIVE\t32\n")
|
"#define L2_ASSOCIATIVE\t32\n"
|
||||||
set(SGEMM_UNROLL_M 4)
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
set(SGEMM_UNROLL_N 4)
|
set(SGEMM_UNROLL_N 4)
|
||||||
elseif ("${CORE}" STREQUAL "CORTEXA57")
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define L1_CODE_SIZE\t49152\n"
|
"#define L1_CODE_SIZE\t32768\n"
|
||||||
"#define L1_CODE_LINESIZE\t64\n"
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||||
"#define L1_DATA_SIZE\t32768\n"
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
"#define L1_DATA_LINESIZE\t64\n"
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||||
"#define L2_SIZE\t2097152\n"
|
"#define L2_SIZE\t262144\n"
|
||||||
"#define L2_LINESIZE\t64\n"
|
"#define L2_LINESIZE\t64\n"
|
||||||
"#define L2_ASSOCIATIVE\t16\n"
|
"#define L2_ASSOCIATIVE\t16\n"
|
||||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
@ -135,15 +191,224 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
||||||
"#define HAVE_VFPV4\n"
|
"#define HAVE_VFPV4\n"
|
||||||
"#define HAVE_VFPV3\n"
|
"#define HAVE_VFPV3\n"
|
||||||
"#define HAVE_VFP\n"
|
"#define HAVE_VFP\n"
|
||||||
"#define HAVE_NEON\n")
|
"#define HAVE_NEON\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
set(SGEMM_UNROLL_M 16)
|
set(SGEMM_UNROLL_M 16)
|
||||||
set(SGEMM_UNROLL_N 4)
|
set(SGEMM_UNROLL_N 4)
|
||||||
set(DGEMM_UNROLL_M 8)
|
set(DGEMM_UNROLL_M 8)
|
||||||
set(DGEMM_UNROLL_N 4)
|
set(DGEMM_UNROLL_N 4)
|
||||||
set(CGEMM_UNROLL_M 8)
|
set(CGEMM_UNROLL_M 8)
|
||||||
set(CGEMM_UNROLL_N 4)
|
set(CGEMM_UNROLL_N 4)
|
||||||
set(ZGEMM_UNROLL_M 8)
|
set(ZGEMM_UNROLL_M 4)
|
||||||
set(ZGEMM_UNROLL_N 4)
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_CODE_SIZE\t49152\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||||
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||||
|
"#define L2_SIZE\t524288\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t16\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define HAVE_VFPV4\n"
|
||||||
|
"#define HAVE_VFPV3\n"
|
||||||
|
"#define HAVE_VFP\n"
|
||||||
|
"#define HAVE_NEON\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "FALKOR")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_CODE_SIZE\t65536\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||||
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t128\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||||
|
"#define L2_SIZE\t524288\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t16\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define HAVE_VFPV4\n"
|
||||||
|
"#define HAVE_VFPV3\n"
|
||||||
|
"#define HAVE_VFP\n"
|
||||||
|
"#define HAVE_NEON\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "THUNDERX")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_CODE_SIZE\t32768\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t3\n"
|
||||||
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t128\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t2\n"
|
||||||
|
"#define L2_SIZE\t16777216\n"
|
||||||
|
"#define L2_LINESIZE\t128\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t16\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define HAVE_VFPV4\n"
|
||||||
|
"#define HAVE_VFPV3\n"
|
||||||
|
"#define HAVE_VFP\n"
|
||||||
|
"#define HAVE_NEON\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 4)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 2)
|
||||||
|
set(DGEMM_UNROLL_N 2)
|
||||||
|
set(CGEMM_UNROLL_M 2)
|
||||||
|
set(CGEMM_UNROLL_N 2)
|
||||||
|
set(ZGEMM_UNROLL_M 2)
|
||||||
|
set(ZGEMM_UNROLL_N 2)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_CODE_SIZE\t32768\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t8\n"
|
||||||
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t8\n"
|
||||||
|
"#define L2_SIZE\t262144\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
|
"#define L3_SIZE\t33554432\n"
|
||||||
|
"#define L3_LINESIZE\t64\n"
|
||||||
|
"#define L3_ASSOCIATIVE\t32\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n"
|
||||||
|
"#define ARMV8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define ARMV8\n"
|
||||||
|
"#define L1_CODE_SIZE\t65536\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L1_DATA_SIZE\t65536\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L2_SIZE\t524288\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "EMAG8180")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define ARMV8\n"
|
||||||
|
"#define L1_CODE_SIZE\t32768\n"
|
||||||
|
"#define L1_CODE_LINESIZE\t64\n"
|
||||||
|
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L1_DATA_SIZE\t32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE\t64\n"
|
||||||
|
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||||
|
"#define L2_SIZE\t5262144\n"
|
||||||
|
"#define L2_LINESIZE\t64\n"
|
||||||
|
"#define L2_ASSOCIATIVE\t8\n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||||
|
"#define DTB_SIZE\t4096\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 8)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 4)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 16)
|
||||||
|
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_DATA_SIZE 32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE 128\n"
|
||||||
|
"#define L2_SIZE 524288\n"
|
||||||
|
"#define L2_LINESIZE 128 \n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||||
|
"#define DTB_SIZE 4096\n"
|
||||||
|
"#define L2_ASSOCIATIVE 8\n")
|
||||||
|
set(SGEMM_UNROLL_M 4)
|
||||||
|
set(SGEMM_UNROLL_N 4)
|
||||||
|
set(DGEMM_UNROLL_M 4)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 2)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 2)
|
||||||
|
set(ZGEMM_UNROLL_N 4)
|
||||||
|
set(SYMV_P 8)
|
||||||
|
elseif ("${TCORE}" STREQUAL "POWER8")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_DATA_SIZE 32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE 128\n"
|
||||||
|
"#define L2_SIZE 524288\n"
|
||||||
|
"#define L2_LINESIZE 128 \n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||||
|
"#define DTB_SIZE 4096\n"
|
||||||
|
"#define L2_ASSOCIATIVE 8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 8)
|
||||||
|
set(DGEMM_UNROLL_M 16)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 8)
|
||||||
|
set(ZGEMM_UNROLL_N 2)
|
||||||
|
set(SYMV_P 8)
|
||||||
|
elseif ("${TCORE}" STREQUAL "POWER9")
|
||||||
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
|
"#define L1_DATA_SIZE 32768\n"
|
||||||
|
"#define L1_DATA_LINESIZE 128\n"
|
||||||
|
"#define L2_SIZE 524288\n"
|
||||||
|
"#define L2_LINESIZE 128 \n"
|
||||||
|
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||||
|
"#define DTB_SIZE 4096\n"
|
||||||
|
"#define L2_ASSOCIATIVE 8\n")
|
||||||
|
set(SGEMM_UNROLL_M 16)
|
||||||
|
set(SGEMM_UNROLL_N 8)
|
||||||
|
set(DGEMM_UNROLL_M 16)
|
||||||
|
set(DGEMM_UNROLL_N 4)
|
||||||
|
set(CGEMM_UNROLL_M 8)
|
||||||
|
set(CGEMM_UNROLL_N 4)
|
||||||
|
set(ZGEMM_UNROLL_M 8)
|
||||||
|
set(ZGEMM_UNROLL_N 2)
|
||||||
|
set(SYMV_P 8)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Or should this actually be NUM_CORES?
|
# Or should this actually be NUM_CORES?
|
||||||
|
@ -163,6 +428,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
|
||||||
file(APPEND ${TARGET_CONF_TEMP}
|
file(APPEND ${TARGET_CONF_TEMP}
|
||||||
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
|
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
|
||||||
# Move to where gen_config_h would place it
|
# Move to where gen_config_h would place it
|
||||||
|
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
|
||||||
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
|
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
|
||||||
|
|
||||||
else(NOT CMAKE_CROSSCOMPILING)
|
else(NOT CMAKE_CROSSCOMPILING)
|
||||||
|
@ -178,6 +444,9 @@ else(NOT CMAKE_CROSSCOMPILING)
|
||||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||||
else()
|
else()
|
||||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||||
|
if (DEFINED TARGET_CORE)
|
||||||
|
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||||
|
|
|
@ -33,12 +33,31 @@ endif ()
|
||||||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||||
set(NO_AVX 1)
|
set(NO_AVX 1)
|
||||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
|
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
|
||||||
set(TARGET "NEHALEM")
|
set(TARGET "NEHALEM")
|
||||||
endif ()
|
endif ()
|
||||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||||
set(TARGET "BARCELONA")
|
set(TARGET "BARCELONA")
|
||||||
endif ()
|
endif ()
|
||||||
|
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
|
||||||
|
set(TARGET "ARMV7")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (DEFINED TARGET)
|
||||||
|
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||||
|
endif()
|
||||||
|
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
|
||||||
|
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||||
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||||
|
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||||
|
endif()
|
||||||
|
# CMake reports Clang as "Clang" (or "AppleClang"); STREQUAL is case-sensitive, so "CLANG" never matched.
elseif (${CMAKE_C_COMPILER_ID} MATCHES "^(Apple)?Clang$")
|
||||||
|
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (DEFINED TARGET)
|
if (DEFINED TARGET)
|
||||||
|
@ -46,6 +65,18 @@ if (DEFINED TARGET)
|
||||||
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||||
|
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||||
|
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
# 32-bit x86 builds have no AVX support, so tell getarch to skip AVX/AVX2/AVX512
|
||||||
|
if (X86 OR X86_64)
|
||||||
|
# 32-bit build: requested explicitly via BINARY=32, or implied by a 4-byte pointer size.
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("${CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
|
||||||
|
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (INTERFACE64)
|
if (INTERFACE64)
|
||||||
message(STATUS "Using 64-bit integers.")
|
message(STATUS "Using 64-bit integers.")
|
||||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
|
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
|
||||||
|
@ -96,6 +127,10 @@ if (NOT CMAKE_CROSSCOMPILING)
|
||||||
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (NOT DEFINED NUM_PARALLEL)
|
||||||
|
set(NUM_PARALLEL 1)
|
||||||
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED NUM_THREADS)
|
if (NOT DEFINED NUM_THREADS)
|
||||||
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
||||||
# HT?
|
# HT?
|
||||||
|
@ -113,10 +148,16 @@ endif ()
|
||||||
|
|
||||||
if (USE_THREAD)
|
if (USE_THREAD)
|
||||||
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
|
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
|
||||||
|
else()
|
||||||
|
if (${USE_LOCKING})
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
|
||||||
|
endif ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||||
|
if (DEFINED BINARY)
|
||||||
|
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||||
|
endif ()
|
||||||
if (NOT DEFINED NEED_PIC)
|
if (NOT DEFINED NEED_PIC)
|
||||||
set(NEED_PIC 1)
|
set(NEED_PIC 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -133,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||||
if (NOT NOFORTRAN)
|
if (NOT NOFORTRAN)
|
||||||
# Fortran Compiler dependent settings
|
# Fortran Compiler dependent settings
|
||||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||||
|
else ()
|
||||||
|
set(NO_LAPACK 1)
|
||||||
|
set(NO_LAPACKE 1)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (BINARY64)
|
if (BINARY64)
|
||||||
|
@ -158,7 +202,22 @@ if (NEED_PIC)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
|
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||||
|
if (DYNAMIC_OLDER)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||||
|
endif ()
|
||||||
|
else ()
|
||||||
|
unset (DYNAMIC_ARCH)
|
||||||
|
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
|
||||||
|
endif ()
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (DYNAMIC_LIST)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
|
||||||
|
foreach(DCORE ${DYNAMIC_LIST})
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
|
||||||
|
endforeach ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
if (NO_LAPACK)
|
if (NO_LAPACK)
|
||||||
|
@ -207,6 +266,10 @@ if (CONSISTENT_FPCSR)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (USE_TLS)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
|
||||||
|
endif ()
|
||||||
|
|
||||||
# Only for development
|
# Only for development
|
||||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
|
||||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
|
||||||
|
@ -224,6 +287,12 @@ endif ()
|
||||||
|
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||||
|
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||||
|
|
||||||
|
if (BUFFERSIZE)
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}")
|
||||||
|
endif ()
|
||||||
|
|
||||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -244,7 +313,7 @@ endif ()
|
||||||
|
|
||||||
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
|
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
|
||||||
|
|
||||||
# TODO: nead to convert these Makefiles
|
# TODO: need to convert these Makefiles
|
||||||
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
|
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
|
||||||
|
|
||||||
if (${CORE} STREQUAL "PPC440")
|
if (${CORE} STREQUAL "PPC440")
|
||||||
|
@ -291,6 +360,8 @@ if (MIXED_MEMORY_ALLOCATION)
|
||||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
|
||||||
|
|
||||||
set(REVISION "-r${OpenBLAS_VERSION}")
|
set(REVISION "-r${OpenBLAS_VERSION}")
|
||||||
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
|
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
|
||||||
set(HOST_OS WINNT)
|
set(HOST_OS WINNT)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
if (${HOST_OS} STREQUAL "LINUX")
|
||||||
|
# check if we're building natively on Android (TERMUX)
|
||||||
|
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
||||||
|
# Quote the expansion so an empty result from `uname -o` yields a false test instead of a malformed if().
if("${OPERATING_SYSTEM}" MATCHES "Android")
|
||||||
|
set(HOST_OS ANDROID)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
|
if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
|
||||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
|
||||||
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
|
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
|
||||||
|
@ -29,13 +39,45 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||||
set(MIPS64 1)
|
set(MIPS64 1)
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||||
|
if (NOT BINARY)
|
||||||
|
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||||
set(X86_64 1)
|
set(X86_64 1)
|
||||||
|
else()
|
||||||
|
set(X86 1)
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
if (${BINARY} EQUAL "64")
|
||||||
|
set(X86_64 1)
|
||||||
|
else ()
|
||||||
|
set(X86 1)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||||
set(X86 1)
|
set(X86 1)
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
|
||||||
set(ARM 1)
|
set(ARM 1)
|
||||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
||||||
|
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||||
set(ARM64 1)
|
set(ARM64 1)
|
||||||
|
else()
|
||||||
|
set(ARM 1)
|
||||||
|
endif()
|
||||||
|
elseif (${CMAKE_CROSSCOMPILING})
|
||||||
|
if (${TARGET} STREQUAL "CORE2")
|
||||||
|
if (NOT BINARY)
|
||||||
|
set(X86 1)
|
||||||
|
elseif (${BINARY} EQUAL "64")
|
||||||
|
set(X86_64 1)
|
||||||
|
else ()
|
||||||
|
set(X86 1)
|
||||||
|
endif()
|
||||||
|
elseif (${TARGET} STREQUAL "ARMV7")
|
||||||
|
set(ARM 1)
|
||||||
|
else()
|
||||||
|
set(ARM64 1)
|
||||||
|
endif ()
|
||||||
|
else ()
|
||||||
|
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (X86_64)
|
if (X86_64)
|
||||||
|
@ -66,3 +108,11 @@ else()
|
||||||
set(BINARY32 1)
|
set(BINARY32 1)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (X86_64 OR X86)
|
||||||
|
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||||
|
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
|
||||||
|
# Any result other than 0 (a different exit code, or an error string when the compiler cannot be run) means the AVX512 probe failed.
if (NOT NO_AVX512 EQUAL 0)
|
||||||
|
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||||
|
endif()
|
||||||
|
# The probe files were written under PROJECT_BINARY_DIR; relative paths here would resolve against the source directory.
file(REMOVE "${PROJECT_BINARY_DIR}/avx512.tmp" "${PROJECT_BINARY_DIR}/avx512.o")
|
||||||
|
endif()
|
||||||
|
|
|
@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
|
||||||
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
|
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
|
||||||
endfunction ()
|
endfunction ()
|
||||||
|
|
||||||
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition
|
# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
|
||||||
# @param sources_in the source files to build from
|
# @param sources_in the source files to build from
|
||||||
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
|
# @param defines_in (optional) preprocessor definitions that will be applied to all objects
|
||||||
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
|
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.
|
||||||
|
|
41
common.h
41
common.h
|
@ -85,6 +85,8 @@ extern "C" {
|
||||||
|
|
||||||
#if !defined(_MSC_VER)
|
#if !defined(_MSC_VER)
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#elif _MSC_VER < 1900
|
||||||
|
#define snprintf _snprintf
|
||||||
#endif
|
#endif
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
|
|
||||||
|
@ -105,6 +107,10 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef OS_HAIKU
|
||||||
|
#define NO_SYSV_IPC
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
#ifdef ATOM
|
#ifdef ATOM
|
||||||
#define GOTO_ATOM ATOM
|
#define GOTO_ATOM ATOM
|
||||||
|
@ -125,7 +131,7 @@ extern "C" {
|
||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
#ifdef SMP
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
@ -179,7 +185,7 @@ extern "C" {
|
||||||
|
|
||||||
#define ALLOCA_ALIGN 63UL
|
#define ALLOCA_ALIGN 63UL
|
||||||
|
|
||||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
#define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
|
||||||
|
|
||||||
#ifdef NEEDBUNDERSCORE
|
#ifdef NEEDBUNDERSCORE
|
||||||
#define BLASFUNC(FUNC) FUNC##_
|
#define BLASFUNC(FUNC) FUNC##_
|
||||||
|
@ -194,7 +200,7 @@ extern "C" {
|
||||||
#error "You can't specify both LOCK operation!"
|
#error "You can't specify both LOCK operation!"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#if defined(SMP) || defined(USE_LOCKING)
|
||||||
#define USE_PTHREAD_LOCK
|
#define USE_PTHREAD_LOCK
|
||||||
#undef USE_PTHREAD_SPINLOCK
|
#undef USE_PTHREAD_SPINLOCK
|
||||||
#endif
|
#endif
|
||||||
|
@ -253,8 +259,14 @@ typedef unsigned long BLASULONG;
|
||||||
|
|
||||||
#ifdef USE64BITINT
|
#ifdef USE64BITINT
|
||||||
typedef BLASLONG blasint;
|
typedef BLASLONG blasint;
|
||||||
|
#if defined(OS_WINDOWS) && defined(__64BIT__)
|
||||||
|
#define blasabs(x) llabs(x)
|
||||||
|
#else
|
||||||
|
#define blasabs(x) labs(x)
|
||||||
|
#endif
|
||||||
#else
|
#else
|
||||||
typedef int blasint;
|
typedef int blasint;
|
||||||
|
#define blasabs(x) abs(x)
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
#ifdef USE64BITINT
|
#ifdef USE64BITINT
|
||||||
|
@ -338,6 +350,11 @@ typedef int blasint;
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef POWER9
|
||||||
|
#ifndef YIELDING
|
||||||
|
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
#ifdef PILEDRIVER
|
#ifdef PILEDRIVER
|
||||||
|
@ -434,7 +451,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||||
typedef char env_var_t[MAX_PATH];
|
typedef char env_var_t[MAX_PATH];
|
||||||
#define readenv(p, n) 0
|
#define readenv(p, n) 0
|
||||||
#else
|
#else
|
||||||
#ifdef OS_WINDOWS
|
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
|
||||||
typedef char env_var_t[MAX_PATH];
|
typedef char env_var_t[MAX_PATH];
|
||||||
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
|
||||||
#else
|
#else
|
||||||
|
@ -647,6 +664,7 @@ void gotoblas_profile_init(void);
|
||||||
void gotoblas_profile_quit(void);
|
void gotoblas_profile_quit(void);
|
||||||
|
|
||||||
#ifdef USE_OPENMP
|
#ifdef USE_OPENMP
|
||||||
|
|
||||||
#ifndef C_MSVC
|
#ifndef C_MSVC
|
||||||
int omp_in_parallel(void);
|
int omp_in_parallel(void);
|
||||||
int omp_get_num_procs(void);
|
int omp_get_num_procs(void);
|
||||||
|
@ -654,6 +672,21 @@ int omp_get_num_procs(void);
|
||||||
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
||||||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if (__STDC_VERSION__ >= 201112L)
|
||||||
|
#if defined(C_GCC) && ( __GNUC__ < 7)
|
||||||
|
// workaround for GCC bug 65467
|
||||||
|
#ifndef _Atomic
|
||||||
|
#define _Atomic volatile
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#include <stdatomic.h>
|
||||||
|
#else
|
||||||
|
#ifndef _Atomic
|
||||||
|
#define _Atomic volatile
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#else
|
#else
|
||||||
#ifdef __ELF__
|
#ifdef __ELF__
|
||||||
int omp_in_parallel (void) __attribute__ ((weak));
|
int omp_in_parallel (void) __attribute__ ((weak));
|
||||||
|
|
|
@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
#define BLAS_LOCK_DEFINED
|
#define BLAS_LOCK_DEFINED
|
||||||
|
|
||||||
|
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||||
|
static __inline BLASULONG rpcc(void){
|
||||||
|
BLASULONG ret = 0;
|
||||||
|
|
||||||
|
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define RPCC_DEFINED
|
||||||
|
#define RPCC64BIT
|
||||||
|
#endif
|
||||||
|
|
||||||
static inline int blas_quickdivide(blasint x, blasint y){
|
static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
return x / y;
|
return x / y;
|
||||||
|
@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
|
|
||||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||||
|
|
||||||
#define PROLOGUE \
|
.macro PROLOGUE
|
||||||
.text ;\
|
.text ;
|
||||||
.align 4 ;\
|
.p2align 2 ;
|
||||||
.global REALNAME ;\
|
.global REALNAME ;
|
||||||
.type REALNAME, %function ;\
|
#ifndef __APPLE__
|
||||||
|
.type REALNAME, %function ;
|
||||||
|
#endif
|
||||||
REALNAME:
|
REALNAME:
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
#define EPILOGUE
|
#define EPILOGUE
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define CDOTC_K cdotc_k
|
#define CDOTC_K cdotc_k
|
||||||
#define CNRM2_K cnrm2_k
|
#define CNRM2_K cnrm2_k
|
||||||
#define CSCAL_K cscal_k
|
#define CSCAL_K cscal_k
|
||||||
|
#define CSUM_K csum_k
|
||||||
#define CSWAP_K cswap_k
|
#define CSWAP_K cswap_k
|
||||||
#define CROT_K csrot_k
|
#define CROT_K csrot_k
|
||||||
|
|
||||||
|
@ -249,6 +250,7 @@
|
||||||
#define CDOTC_K gotoblas -> cdotc_k
|
#define CDOTC_K gotoblas -> cdotc_k
|
||||||
#define CNRM2_K gotoblas -> cnrm2_k
|
#define CNRM2_K gotoblas -> cnrm2_k
|
||||||
#define CSCAL_K gotoblas -> cscal_k
|
#define CSCAL_K gotoblas -> cscal_k
|
||||||
|
#define CSUM_K gotoblas -> csum_k
|
||||||
#define CSWAP_K gotoblas -> cswap_k
|
#define CSWAP_K gotoblas -> cswap_k
|
||||||
#define CROT_K gotoblas -> csrot_k
|
#define CROT_K gotoblas -> csrot_k
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define DDOTC_K ddot_k
|
#define DDOTC_K ddot_k
|
||||||
#define DNRM2_K dnrm2_k
|
#define DNRM2_K dnrm2_k
|
||||||
#define DSCAL_K dscal_k
|
#define DSCAL_K dscal_k
|
||||||
|
#define DSUM_K dsum_k
|
||||||
#define DSWAP_K dswap_k
|
#define DSWAP_K dswap_k
|
||||||
#define DROT_K drot_k
|
#define DROT_K drot_k
|
||||||
|
|
||||||
|
@ -174,6 +175,7 @@
|
||||||
#define DDOTC_K gotoblas -> ddot_k
|
#define DDOTC_K gotoblas -> ddot_k
|
||||||
#define DNRM2_K gotoblas -> dnrm2_k
|
#define DNRM2_K gotoblas -> dnrm2_k
|
||||||
#define DSCAL_K gotoblas -> dscal_k
|
#define DSCAL_K gotoblas -> dscal_k
|
||||||
|
#define DSUM_K gotoblas -> dsum_k
|
||||||
#define DSWAP_K gotoblas -> dswap_k
|
#define DSWAP_K gotoblas -> dswap_k
|
||||||
#define DROT_K gotoblas -> drot_k
|
#define DROT_K gotoblas -> drot_k
|
||||||
|
|
||||||
|
|
|
@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
|
||||||
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
|
double BLASFUNC(dzasum)(blasint *, double *, blasint *);
|
||||||
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
|
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
|
||||||
|
|
||||||
|
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
|
||||||
|
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
|
||||||
|
double BLASFUNC(dsum) (blasint *, double *, blasint *);
|
||||||
|
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
|
||||||
|
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
|
||||||
|
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
|
||||||
|
|
||||||
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
|
blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
|
||||||
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
|
blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
|
||||||
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
|
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
|
||||||
|
|
146
common_lapack.h
146
common_lapack.h
|
@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
|
||||||
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
|
||||||
|
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
|
||||||
|
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||||
|
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||||
|
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
|
||||||
double zasum_k (BLASLONG, double *, BLASLONG);
|
double zasum_k (BLASLONG, double *, BLASLONG);
|
||||||
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
|
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
|
float ssum_k (BLASLONG, float *, BLASLONG);
|
||||||
|
double dsum_k (BLASLONG, double *, BLASLONG);
|
||||||
|
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
float csum_k (BLASLONG, float *, BLASLONG);
|
||||||
|
double zsum_k (BLASLONG, double *, BLASLONG);
|
||||||
|
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
float samax_k (BLASLONG, float *, BLASLONG);
|
float samax_k (BLASLONG, float *, BLASLONG);
|
||||||
double damax_k (BLASLONG, double *, BLASLONG);
|
double damax_k (BLASLONG, double *, BLASLONG);
|
||||||
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
|
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
|
@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
|
||||||
|
float * A, BLASLONG strideA,
|
||||||
|
float * B, BLASLONG strideB,
|
||||||
|
float * R, BLASLONG strideR);
|
||||||
|
|
||||||
|
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
|
||||||
|
|
||||||
|
|
||||||
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||||
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,
|
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,
|
||||||
|
|
163
common_macro.h
163
common_macro.h
|
@ -66,6 +66,7 @@
|
||||||
#define DOTC_K QDOTC_K
|
#define DOTC_K QDOTC_K
|
||||||
#define NRM2_K QNRM2_K
|
#define NRM2_K QNRM2_K
|
||||||
#define SCAL_K QSCAL_K
|
#define SCAL_K QSCAL_K
|
||||||
|
#define SUM_K QSUM_K
|
||||||
#define SWAP_K QSWAP_K
|
#define SWAP_K QSWAP_K
|
||||||
#define ROT_K QROT_K
|
#define ROT_K QROT_K
|
||||||
|
|
||||||
|
@ -356,6 +357,7 @@
|
||||||
#define DOTC_K DDOTC_K
|
#define DOTC_K DDOTC_K
|
||||||
#define NRM2_K DNRM2_K
|
#define NRM2_K DNRM2_K
|
||||||
#define SCAL_K DSCAL_K
|
#define SCAL_K DSCAL_K
|
||||||
|
#define SUM_K DSUM_K
|
||||||
#define SWAP_K DSWAP_K
|
#define SWAP_K DSWAP_K
|
||||||
#define ROT_K DROT_K
|
#define ROT_K DROT_K
|
||||||
|
|
||||||
|
@ -658,6 +660,7 @@
|
||||||
#define DOTC_K SDOTC_K
|
#define DOTC_K SDOTC_K
|
||||||
#define NRM2_K SNRM2_K
|
#define NRM2_K SNRM2_K
|
||||||
#define SCAL_K SSCAL_K
|
#define SCAL_K SSCAL_K
|
||||||
|
#define SUM_K SSUM_K
|
||||||
#define SWAP_K SSWAP_K
|
#define SWAP_K SSWAP_K
|
||||||
#define ROT_K SROT_K
|
#define ROT_K SROT_K
|
||||||
|
|
||||||
|
@ -962,6 +965,7 @@
|
||||||
#define DOTC_K XDOTC_K
|
#define DOTC_K XDOTC_K
|
||||||
#define NRM2_K XNRM2_K
|
#define NRM2_K XNRM2_K
|
||||||
#define SCAL_K XSCAL_K
|
#define SCAL_K XSCAL_K
|
||||||
|
#define SUM_K XSUM_K
|
||||||
#define SWAP_K XSWAP_K
|
#define SWAP_K XSWAP_K
|
||||||
#define ROT_K XROT_K
|
#define ROT_K XROT_K
|
||||||
|
|
||||||
|
@ -1363,6 +1367,7 @@
|
||||||
#define DOTC_K ZDOTC_K
|
#define DOTC_K ZDOTC_K
|
||||||
#define NRM2_K ZNRM2_K
|
#define NRM2_K ZNRM2_K
|
||||||
#define SCAL_K ZSCAL_K
|
#define SCAL_K ZSCAL_K
|
||||||
|
#define SUM_K ZSUM_K
|
||||||
#define SWAP_K ZSWAP_K
|
#define SWAP_K ZSWAP_K
|
||||||
#define ROT_K ZROT_K
|
#define ROT_K ZROT_K
|
||||||
|
|
||||||
|
@ -1785,6 +1790,7 @@
|
||||||
#define DOTC_K CDOTC_K
|
#define DOTC_K CDOTC_K
|
||||||
#define NRM2_K CNRM2_K
|
#define NRM2_K CNRM2_K
|
||||||
#define SCAL_K CSCAL_K
|
#define SCAL_K CSCAL_K
|
||||||
|
#define SUM_K CSUM_K
|
||||||
#define SWAP_K CSWAP_K
|
#define SWAP_K CSWAP_K
|
||||||
#define ROT_K CROT_K
|
#define ROT_K CROT_K
|
||||||
|
|
||||||
|
@ -2800,3 +2806,160 @@ typedef struct {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef COMPLEX
|
||||||
|
#ifdef XDOUBLE
|
||||||
|
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
|
||||||
|
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
|
||||||
|
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
|
||||||
|
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
|
||||||
|
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
|
||||||
|
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
|
||||||
|
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
|
||||||
|
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
|
||||||
|
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
|
||||||
|
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
|
||||||
|
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
|
||||||
|
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
|
||||||
|
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
|
||||||
|
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
|
||||||
|
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
|
||||||
|
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
|
||||||
|
|
||||||
|
#elif defined(DOUBLE)
|
||||||
|
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
|
||||||
|
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
|
||||||
|
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
|
||||||
|
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
|
||||||
|
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
|
||||||
|
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
|
||||||
|
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
|
||||||
|
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
|
||||||
|
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
|
||||||
|
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
|
||||||
|
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
|
||||||
|
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
|
||||||
|
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
|
||||||
|
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
|
||||||
|
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
|
||||||
|
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
|
||||||
|
#else
|
||||||
|
#define TRTRS_UNU_SINGLE strtrs_UNU_single
|
||||||
|
#define TRTRS_UNN_SINGLE strtrs_UNN_single
|
||||||
|
#define TRTRS_UTU_SINGLE strtrs_UTU_single
|
||||||
|
#define TRTRS_UTN_SINGLE strtrs_UTN_single
|
||||||
|
#define TRTRS_LNU_SINGLE strtrs_LNU_single
|
||||||
|
#define TRTRS_LNN_SINGLE strtrs_LNN_single
|
||||||
|
#define TRTRS_LTU_SINGLE strtrs_LTU_single
|
||||||
|
#define TRTRS_LTN_SINGLE strtrs_LTN_single
|
||||||
|
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
|
||||||
|
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
|
||||||
|
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
|
||||||
|
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
|
||||||
|
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
|
||||||
|
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
|
||||||
|
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
|
||||||
|
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#ifdef XDOUBLE
|
||||||
|
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
|
||||||
|
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
|
||||||
|
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
|
||||||
|
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
|
||||||
|
#define TRTRS_URU_SINGLE xtrtrs_URU_single
|
||||||
|
#define TRTRS_URN_SINGLE xtrtrs_URN_single
|
||||||
|
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
|
||||||
|
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
|
||||||
|
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
|
||||||
|
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
|
||||||
|
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
|
||||||
|
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
|
||||||
|
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
|
||||||
|
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
|
||||||
|
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
|
||||||
|
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
|
||||||
|
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
|
||||||
|
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
|
||||||
|
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
|
||||||
|
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
|
||||||
|
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
|
||||||
|
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
|
||||||
|
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
|
||||||
|
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
|
||||||
|
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
|
||||||
|
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
|
||||||
|
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
|
||||||
|
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
|
||||||
|
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
|
||||||
|
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
|
||||||
|
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
|
||||||
|
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
|
||||||
|
#elif defined(DOUBLE)
|
||||||
|
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
|
||||||
|
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
|
||||||
|
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
|
||||||
|
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
|
||||||
|
#define TRTRS_URU_SINGLE ztrtrs_URU_single
|
||||||
|
#define TRTRS_URN_SINGLE ztrtrs_URN_single
|
||||||
|
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
|
||||||
|
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
|
||||||
|
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
|
||||||
|
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
|
||||||
|
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
|
||||||
|
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
|
||||||
|
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
|
||||||
|
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
|
||||||
|
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
|
||||||
|
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
|
||||||
|
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
|
||||||
|
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
|
||||||
|
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
|
||||||
|
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
|
||||||
|
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
|
||||||
|
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
|
||||||
|
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
|
||||||
|
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
|
||||||
|
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
|
||||||
|
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
|
||||||
|
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
|
||||||
|
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
|
||||||
|
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
|
||||||
|
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
|
||||||
|
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
|
||||||
|
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
|
||||||
|
#else
|
||||||
|
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
|
||||||
|
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
|
||||||
|
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
|
||||||
|
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
|
||||||
|
#define TRTRS_URU_SINGLE ctrtrs_URU_single
|
||||||
|
#define TRTRS_URN_SINGLE ctrtrs_URN_single
|
||||||
|
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
|
||||||
|
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
|
||||||
|
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
|
||||||
|
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
|
||||||
|
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
|
||||||
|
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
|
||||||
|
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
|
||||||
|
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
|
||||||
|
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
|
||||||
|
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
|
||||||
|
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
|
||||||
|
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
|
||||||
|
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
|
||||||
|
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
|
||||||
|
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
|
||||||
|
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
|
||||||
|
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
|
||||||
|
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
|
||||||
|
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
|
||||||
|
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
|
||||||
|
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
|
||||||
|
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
|
||||||
|
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
|
||||||
|
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
|
||||||
|
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
|
||||||
|
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
|
@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){
|
||||||
#define RPCC_DEFINED
|
#define RPCC_DEFINED
|
||||||
|
|
||||||
#ifndef NO_AFFINITY
|
#ifndef NO_AFFINITY
|
||||||
#define WHEREAMI
|
//#define WHEREAMI
|
||||||
static inline int WhereAmI(void){
|
static inline int WhereAmI(void){
|
||||||
int ret=0;
|
int ret=0;
|
||||||
__asm__ __volatile__(".set push \n"
|
__asm__ __volatile__(".set push \n"
|
||||||
|
|
|
@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
||||||
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*ssum_k) (BLASLONG, float *, BLASLONG);
|
||||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
||||||
|
|
||||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
||||||
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
||||||
|
double (*dsum_k) (BLASLONG, double *, BLASLONG);
|
||||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
||||||
|
@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
|
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
|
||||||
|
@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
||||||
|
|
||||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||||
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
||||||
|
float (*csum_k) (BLASLONG, float *, BLASLONG);
|
||||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||||
|
@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
|
||||||
|
|
||||||
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
|
double (*znrm2_k) (BLASLONG, double *, BLASLONG);
|
||||||
double (*zasum_k) (BLASLONG, double *, BLASLONG);
|
double (*zasum_k) (BLASLONG, double *, BLASLONG);
|
||||||
|
double (*zsum_k) (BLASLONG, double *, BLASLONG);
|
||||||
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||||
|
@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
||||||
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
|
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
|
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
|
||||||
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||||
|
|
|
@ -39,7 +39,36 @@
|
||||||
#ifndef COMMON_POWER
|
#ifndef COMMON_POWER
|
||||||
#define COMMON_POWER
|
#define COMMON_POWER
|
||||||
|
|
||||||
#if defined(POWER8)
|
#define str(x) #x
|
||||||
|
|
||||||
|
#ifdef OS_AIX
|
||||||
|
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
|
||||||
|
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
|
||||||
|
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
|
||||||
|
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
|
||||||
|
#define XVMOVDP(T,A) xvcpsgndp T, A, A
|
||||||
|
|
||||||
|
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
|
||||||
|
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
|
||||||
|
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
|
||||||
|
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
|
||||||
|
|
||||||
|
#else
|
||||||
|
#define XXSPLTD(T,A,z) xxspltd T, A, z
|
||||||
|
#define XXMRGHD(T,A,B) xxmrghd T, A, B
|
||||||
|
#define XXMRGLD(T,A,B) xxmrgld T, A, B
|
||||||
|
#define XXSWAPD(T,A) xxswapd T, A
|
||||||
|
#define XVMOVDP(T,A) xvmovdp T, A
|
||||||
|
|
||||||
|
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
|
||||||
|
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||||
|
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||||
|
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(POWER8) || defined(POWER9)
|
||||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||||
#else
|
#else
|
||||||
|
@ -241,7 +270,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define HAVE_PREFETCH
|
#define HAVE_PREFETCH
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
|
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||||
#define DCBT_ARG 0
|
#define DCBT_ARG 0
|
||||||
#else
|
#else
|
||||||
#define DCBT_ARG 8
|
#define DCBT_ARG 8
|
||||||
|
@ -263,7 +292,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define L1_PREFETCH dcbtst
|
#define L1_PREFETCH dcbtst
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(POWER8)
|
#if defined(POWER8) || defined(POWER9)
|
||||||
#define L1_DUALFETCH
|
#define L1_DUALFETCH
|
||||||
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
#define L1_PREFETCHSIZE (16 + 128 * 100)
|
||||||
#define L1_PREFETCH dcbtst
|
#define L1_PREFETCH dcbtst
|
||||||
|
@ -499,7 +528,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
|
|
||||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.section .text;\
|
.section .text;\
|
||||||
|
@ -598,9 +627,14 @@ REALNAME:;\
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.machine "any";\
|
.machine "any";\
|
||||||
|
.toc;\
|
||||||
.globl .REALNAME;\
|
.globl .REALNAME;\
|
||||||
|
.globl REALNAME;\
|
||||||
|
.csect REALNAME[DS],3;\
|
||||||
|
REALNAME:;\
|
||||||
|
.long .REALNAME, TOC[tc0], 0;\
|
||||||
.csect .text[PR],5;\
|
.csect .text[PR],5;\
|
||||||
.REALNAME:;
|
.REALNAME:
|
||||||
|
|
||||||
#define EPILOGUE \
|
#define EPILOGUE \
|
||||||
_section_.text:;\
|
_section_.text:;\
|
||||||
|
@ -611,9 +645,14 @@ _section_.text:;\
|
||||||
|
|
||||||
#define PROLOGUE \
|
#define PROLOGUE \
|
||||||
.machine "any";\
|
.machine "any";\
|
||||||
|
.toc;\
|
||||||
.globl .REALNAME;\
|
.globl .REALNAME;\
|
||||||
|
.globl REALNAME;\
|
||||||
|
.csect REALNAME[DS],3;\
|
||||||
|
REALNAME:;\
|
||||||
|
.llong .REALNAME, TOC[tc0], 0;\
|
||||||
.csect .text[PR], 5;\
|
.csect .text[PR], 5;\
|
||||||
.REALNAME:;
|
.REALNAME:
|
||||||
|
|
||||||
#define EPILOGUE \
|
#define EPILOGUE \
|
||||||
_section_.text:;\
|
_section_.text:;\
|
||||||
|
@ -774,7 +813,7 @@ Lmcount$lazy_ptr:
|
||||||
|
|
||||||
#define HALT mfspr r0, 1023
|
#define HALT mfspr r0, 1023
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||||
#if defined(PPC440) || defined(PPC440FP2)
|
#if defined(PPC440) || defined(PPC440FP2)
|
||||||
#undef MAX_CPU_NUMBER
|
#undef MAX_CPU_NUMBER
|
||||||
#define MAX_CPU_NUMBER 1
|
#define MAX_CPU_NUMBER 1
|
||||||
|
@ -802,7 +841,7 @@ Lmcount$lazy_ptr:
|
||||||
#define BUFFER_SIZE ( 2 << 20)
|
#define BUFFER_SIZE ( 2 << 20)
|
||||||
#elif defined(PPC440FP2)
|
#elif defined(PPC440FP2)
|
||||||
#define BUFFER_SIZE ( 16 << 20)
|
#define BUFFER_SIZE ( 16 << 20)
|
||||||
#elif defined(POWER8)
|
#elif defined(POWER8) || defined(POWER9)
|
||||||
#define BUFFER_SIZE ( 64 << 20)
|
#define BUFFER_SIZE ( 64 << 20)
|
||||||
#else
|
#else
|
||||||
#define BUFFER_SIZE ( 16 << 20)
|
#define BUFFER_SIZE ( 16 << 20)
|
||||||
|
@ -819,7 +858,7 @@ Lmcount$lazy_ptr:
|
||||||
#define MAP_ANONYMOUS MAP_ANON
|
#define MAP_ANONYMOUS MAP_ANON
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef OS_LINUX
|
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||||
#ifndef __64BIT__
|
#ifndef __64BIT__
|
||||||
#define FRAMESLOT(X) (((X) * 4) + 8)
|
#define FRAMESLOT(X) (((X) * 4) + 8)
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define QDOTC_K qdot_k
|
#define QDOTC_K qdot_k
|
||||||
#define QNRM2_K qnrm2_k
|
#define QNRM2_K qnrm2_k
|
||||||
#define QSCAL_K qscal_k
|
#define QSCAL_K qscal_k
|
||||||
|
#define QSUM_K qsum_k
|
||||||
#define QSWAP_K qswap_k
|
#define QSWAP_K qswap_k
|
||||||
#define QROT_K qrot_k
|
#define QROT_K qrot_k
|
||||||
|
|
||||||
|
@ -161,6 +162,7 @@
|
||||||
#define QDOTC_K gotoblas -> qdot_k
|
#define QDOTC_K gotoblas -> qdot_k
|
||||||
#define QNRM2_K gotoblas -> qnrm2_k
|
#define QNRM2_K gotoblas -> qnrm2_k
|
||||||
#define QSCAL_K gotoblas -> qscal_k
|
#define QSCAL_K gotoblas -> qscal_k
|
||||||
|
#define QSUM_K gotoblas -> qsum_k
|
||||||
#define QSWAP_K gotoblas -> qswap_k
|
#define QSWAP_K gotoblas -> qswap_k
|
||||||
#define QROT_K gotoblas -> qrot_k
|
#define QROT_K gotoblas -> qrot_k
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,7 @@
|
||||||
#define ISMAX_K ismax_k
|
#define ISMAX_K ismax_k
|
||||||
#define ISMIN_K ismin_k
|
#define ISMIN_K ismin_k
|
||||||
#define SASUM_K sasum_k
|
#define SASUM_K sasum_k
|
||||||
|
#define SSUM_K ssum_k
|
||||||
#define SAXPYU_K saxpy_k
|
#define SAXPYU_K saxpy_k
|
||||||
#define SAXPYC_K saxpy_k
|
#define SAXPYC_K saxpy_k
|
||||||
#define SCOPY_K scopy_k
|
#define SCOPY_K scopy_k
|
||||||
|
@ -170,6 +171,7 @@
|
||||||
#define ISMAX_K gotoblas -> ismax_k
|
#define ISMAX_K gotoblas -> ismax_k
|
||||||
#define ISMIN_K gotoblas -> ismin_k
|
#define ISMIN_K gotoblas -> ismin_k
|
||||||
#define SASUM_K gotoblas -> sasum_k
|
#define SASUM_K gotoblas -> sasum_k
|
||||||
|
#define SSUM_K gotoblas -> ssum_k
|
||||||
#define SAXPYU_K gotoblas -> saxpy_k
|
#define SAXPYU_K gotoblas -> saxpy_k
|
||||||
#define SAXPYC_K gotoblas -> saxpy_k
|
#define SAXPYC_K gotoblas -> saxpy_k
|
||||||
#define SCOPY_K gotoblas -> scopy_k
|
#define SCOPY_K gotoblas -> scopy_k
|
||||||
|
|
|
@ -45,16 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* SIZE must be carefully chosen to be:
|
* SIZE must be carefully chosen to be:
|
||||||
* - as small as possible to maximize the number of stack allocation
|
* - as small as possible to maximize the number of stack allocation
|
||||||
* - large enough to support all architectures and kernel
|
* - large enough to support all architectures and kernel
|
||||||
* Chosing a too small SIZE will lead to a stack smashing.
|
* Choosing a SIZE too small will lead to a stack smashing.
|
||||||
*/
|
*/
|
||||||
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
|
||||||
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
/* make it volatile because some function (ex: dgemv_n.S) */ \
|
||||||
/* do not restore all register */ \
|
/* do not restore all register */ \
|
||||||
volatile int stack_alloc_size = SIZE; \
|
volatile int stack_alloc_size = SIZE; \
|
||||||
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
|
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
|
||||||
stack_alloc_size = 0; \
|
|
||||||
STACK_ALLOC_PROTECT_SET \
|
STACK_ALLOC_PROTECT_SET \
|
||||||
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
|
/* Avoid declaring an array of length 0 */ \
|
||||||
|
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
|
||||||
|
__attribute__((aligned(0x20))); \
|
||||||
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
|
||||||
#else
|
#else
|
||||||
//Original OpenBLAS/GotoBLAS codes.
|
//Original OpenBLAS/GotoBLAS codes.
|
||||||
|
|
|
@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
||||||
|
|
||||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||||
|
|
||||||
int beta_thread(int mode, BLASLONG m, BLASLONG n,
|
|
||||||
double alpha_r, double alpha_i,
|
|
||||||
void *c, BLASLONG ldc, int (*fuction)());
|
|
||||||
|
|
||||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||||
void *offsetA, BLASLONG lda,
|
void *offsetA, BLASLONG lda,
|
||||||
void *offsetB, BLASLONG jb,
|
void *offsetB, BLASLONG jb,
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define XDOTC_K xdotc_k
|
#define XDOTC_K xdotc_k
|
||||||
#define XNRM2_K xnrm2_k
|
#define XNRM2_K xnrm2_k
|
||||||
#define XSCAL_K xscal_k
|
#define XSCAL_K xscal_k
|
||||||
|
#define XSUM_K xsum_k
|
||||||
#define XSWAP_K xswap_k
|
#define XSWAP_K xswap_k
|
||||||
#define XROT_K xqrot_k
|
#define XROT_K xqrot_k
|
||||||
|
|
||||||
|
@ -227,6 +228,7 @@
|
||||||
#define XDOTC_K gotoblas -> xdotc_k
|
#define XDOTC_K gotoblas -> xdotc_k
|
||||||
#define XNRM2_K gotoblas -> xnrm2_k
|
#define XNRM2_K gotoblas -> xnrm2_k
|
||||||
#define XSCAL_K gotoblas -> xscal_k
|
#define XSCAL_K gotoblas -> xscal_k
|
||||||
|
#define XSUM_K gotoblas -> xsum_k
|
||||||
#define XSWAP_K gotoblas -> xswap_k
|
#define XSWAP_K gotoblas -> xswap_k
|
||||||
#define XROT_K gotoblas -> xqrot_k
|
#define XROT_K gotoblas -> xqrot_k
|
||||||
|
|
||||||
|
|
10
common_x86.h
10
common_x86.h
|
@ -178,10 +178,16 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
result = x/y;
|
result = x/y;
|
||||||
return result;
|
return result;
|
||||||
#else
|
#else
|
||||||
|
#if (MAX_CPU_NUMBER > 64)
|
||||||
|
if ( y > 64) {
|
||||||
|
result = x/y;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
y = blas_quick_divide_table[y];
|
y = blas_quick_divide_table[y];
|
||||||
|
|
||||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
#endif
|
#endif
|
||||||
|
@ -208,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
//Enable some optimazation for barcelona.
|
//Enable some optimization for barcelona.
|
||||||
#define BARCELONA_OPTIMIZATION
|
#define BARCELONA_OPTIMIZATION
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -60,8 +60,13 @@
|
||||||
#endif
|
#endif
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define MB
|
#ifdef __GNUC__
|
||||||
#define WMB
|
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||||
|
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
|
||||||
|
#else
|
||||||
|
#define MB do {} while (0)
|
||||||
|
#define WMB do {} while (0)
|
||||||
|
#endif
|
||||||
|
|
||||||
static void __inline blas_lock(volatile BLASULONG *address){
|
static void __inline blas_lock(volatile BLASULONG *address){
|
||||||
|
|
||||||
|
@ -124,7 +129,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
||||||
*ecx=cpuinfo[2];
|
*ecx=cpuinfo[2];
|
||||||
*edx=cpuinfo[3];
|
*edx=cpuinfo[3];
|
||||||
#else
|
#else
|
||||||
__asm__ __volatile__("cpuid"
|
__asm__ __volatile__("mov $0, %%ecx;"
|
||||||
|
"cpuid"
|
||||||
: "=a" (*eax),
|
: "=a" (*eax),
|
||||||
"=b" (*ebx),
|
"=b" (*ebx),
|
||||||
"=c" (*ecx),
|
"=c" (*ecx),
|
||||||
|
@ -196,9 +202,16 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
|
|
||||||
if (y <= 1) return x;
|
if (y <= 1) return x;
|
||||||
|
|
||||||
|
#if (MAX_CPU_NUMBER > 64)
|
||||||
|
if (y > 64) {
|
||||||
|
result = x / y;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
y = blas_quick_divide_table[y];
|
y = blas_quick_divide_table[y];
|
||||||
|
|
||||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
@ -212,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#endif
|
#endif
|
||||||
#define HUGE_PAGESIZE ( 2 << 20)
|
#define HUGE_PAGESIZE ( 2 << 20)
|
||||||
|
|
||||||
|
#ifndef BUFFERSIZE
|
||||||
#define BUFFER_SIZE (32 << 20)
|
#define BUFFER_SIZE (32 << 20)
|
||||||
|
#else
|
||||||
|
#define BUFFER_SIZE (32 << BUFFERSIZE)
|
||||||
|
#endif
|
||||||
|
|
||||||
#define SEEK_ADDRESS
|
#define SEEK_ADDRESS
|
||||||
|
|
||||||
|
@ -264,7 +281,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#ifdef ASSEMBLER
|
#ifdef ASSEMBLER
|
||||||
|
|
||||||
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
|
||||||
//Enable some optimazation for barcelona.
|
//Enable some optimization for barcelona.
|
||||||
#define BARCELONA_OPTIMIZATION
|
#define BARCELONA_OPTIMIZATION
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#define ZDOTC_K zdotc_k
|
#define ZDOTC_K zdotc_k
|
||||||
#define ZNRM2_K znrm2_k
|
#define ZNRM2_K znrm2_k
|
||||||
#define ZSCAL_K zscal_k
|
#define ZSCAL_K zscal_k
|
||||||
|
#define ZSUM_K zsum_k
|
||||||
#define ZSWAP_K zswap_k
|
#define ZSWAP_K zswap_k
|
||||||
#define ZROT_K zdrot_k
|
#define ZROT_K zdrot_k
|
||||||
|
|
||||||
|
@ -249,6 +250,7 @@
|
||||||
#define ZDOTC_K gotoblas -> zdotc_k
|
#define ZDOTC_K gotoblas -> zdotc_k
|
||||||
#define ZNRM2_K gotoblas -> znrm2_k
|
#define ZNRM2_K gotoblas -> znrm2_k
|
||||||
#define ZSCAL_K gotoblas -> zscal_k
|
#define ZSCAL_K gotoblas -> zscal_k
|
||||||
|
#define ZSUM_K gotoblas -> zsum_k
|
||||||
#define ZSWAP_K gotoblas -> zswap_k
|
#define ZSWAP_K gotoblas -> zswap_k
|
||||||
#define ZROT_K gotoblas -> zdrot_k
|
#define ZROT_K gotoblas -> zdrot_k
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
# Builds and runs the C++ thread-safety testers for DGEMV and DGEMM, linking
# them against the static library in the parent directory.
include ../Makefile.rule

all :: dgemv_tester dgemm_tester

# Listing the source, the shared helper header and the library as prerequisites
# makes the tester get rebuilt (and therefore re-run) whenever one of them
# changes; without prerequisites an existing binary would never be refreshed.
dgemv_tester : dgemv_thread_safety.cpp cpp_thread_safety_common.h ../libopenblas.a
	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
	./dgemv_tester

# dgemv_tester stays a prerequisite, so the DGEMV test is always handled before
# the DGEMM test (presumably to keep the two from running at the same time).
dgemm_tester : dgemv_tester dgemm_thread_safety.cpp cpp_thread_safety_common.h ../libopenblas.a
	$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
	./dgemm_tester

clean ::
	rm -f dgemv_tester dgemm_tester
|
|
@ -0,0 +1,55 @@
|
||||||
|
inline void pauser(){
    /// Prompts on standard output, then blocks until one line (normally just
    /// the enter key) has been read from standard input.
    std::cout << "Press enter to continue...";
    std::string discardedLine;
    std::getline(std::cin, discardedLine);
}
|
||||||
|
|
||||||
|
/// Fills the first numMat matrices of matBlock with values drawn from rngdist,
/// then copies that group of numMat matrices into every following group, so
/// that all numConcurrentThreads groups hold identical data.
///
/// matBlock is indexed, never resized: it must already hold
/// numConcurrentThreads*numMat vectors, and the first numMat of them must have
/// at least randomMatSize*randomMatSize elements.
///
/// Declared inline because the definition lives in a header.
inline void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    // Widen before multiplying: the product of two blasint values can overflow
    // the blasint type for large matrix dimensions.
    const uint64_t numElements = static_cast<uint64_t>(randomMatSize) * static_cast<uint64_t>(randomMatSize);

    for(uint32_t i=0; i<numMat; i++){
        for(uint64_t j = 0; j < numElements; j++){
            matBlock[i][j] = rngdist(PRNG);
        }
    }

    // Replicate the first group into the groups used by the other concurrent calls.
    for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
        for(uint32_t j=0; j<numMat; j++){
            matBlock[i+j] = matBlock[j];
        }
    }
}
|
||||||
|
|
||||||
|
/// Fills the first numVec vectors of vecBlock with values drawn from rngdist,
/// then copies that group of numVec vectors into every following group, so
/// that all numConcurrentThreads groups hold identical data.
///
/// vecBlock is indexed, never resized: it must already hold
/// numConcurrentThreads*numVec vectors, and the first numVec of them must have
/// at least randomMatSize elements.
///
/// Declared inline because the definition lives in a header; a non-inline
/// definition would clash at link time if two translation units included it.
inline void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
    const uint32_t vecLength = static_cast<uint32_t>(randomMatSize);

    for(uint32_t i=0; i<numVec; i++){
        for(uint32_t j = 0; j < vecLength; j++){
            vecBlock[i][j] = rngdist(PRNG);
        }
    }

    // Replicate the first group into the groups used by the other concurrent calls.
    for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
        for(uint32_t j=0; j<numVec; j++){
            vecBlock[i+j] = vecBlock[j];
        }
    }
}
|
||||||
|
|
||||||
|
/// Returns a 64-bit Mersenne Twister seeded from std::random_device and
/// advanced by ten million draws.
///
/// Declared inline because the definition lives in a header; a non-inline
/// definition would clash at link time if two translation units included it.
inline std::mt19937_64 InitPRNG(){
    std::random_device rd;
    std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
    std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
    //make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
    //PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
    for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
    return PRNG;
}
|
||||||
|
|
||||||
|
/// Debugging aid: writes every matrix in matBlock to standard output.
///
/// For each of the numConcurrentThreads*numMat matrices it prints the matrix
/// index on its own line, then randomMatSize lines of randomMatSize
/// space-separated values (element j*randomMatSize + k for line j, column k),
/// then an empty line.
///
/// Declared inline because the definition lives in a header; a non-inline
/// definition would clash at link time if two translation units included it.
inline void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    const uint32_t dim = static_cast<uint32_t>(randomMatSize);

    for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
        std::cout<<i<<std::endl;
        for (uint32_t j = 0; j < dim; j++){
            for (uint32_t k = 0; k < dim; k++){
                std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
            }
            std::cout<<std::endl;
        }
        std::cout<<std::endl;
    }
}
|
|
@ -0,0 +1,92 @@
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <random>
|
||||||
|
#include <future>
|
||||||
|
#include <omp.h>
|
||||||
|
#include "../cblas.h"
|
||||||
|
#include "cpp_thread_safety_common.h"
|
||||||
|
|
||||||
|
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
|
||||||
|
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]){
|
||||||
|
blasint randomMatSize = 1024; //dimension of the random square matrices used
|
||||||
|
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||||
|
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||||
|
|
||||||
|
if (argc > 4){
|
||||||
|
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
if(argc == 4){
|
||||||
|
std::vector<std::string> cliArgs;
|
||||||
|
for (int i = 1; i < argc; i++){
|
||||||
|
cliArgs.push_back(argv[i]);
|
||||||
|
std::cout<<argv[i]<<std::endl;
|
||||||
|
}
|
||||||
|
randomMatSize = std::stoul(cliArgs[0]);
|
||||||
|
numConcurrentThreads = std::stoul(cliArgs[1]);
|
||||||
|
numTestRounds = std::stoul(cliArgs[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||||
|
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
|
||||||
|
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||||
|
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"| DGEMM thread safety tester |\n";
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
|
||||||
|
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||||
|
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||||
|
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||||
|
std::mt19937_64 PRNG = InitPRNG();
|
||||||
|
std::cout<<"done\n";
|
||||||
|
|
||||||
|
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
|
||||||
|
std::cout<<"Allocating matrices..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
|
||||||
|
matBlock[i].resize(randomMatSize*randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
//pauser();
|
||||||
|
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||||
|
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
|
||||||
|
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Testing CBLAS DGEMM thread safety\n";
|
||||||
|
omp_set_num_threads(numConcurrentThreads);
|
||||||
|
for(uint32_t R=0; R<numTestRounds; R++){
|
||||||
|
std::cout<<"DGEMM round #"<<R<<std::endl;
|
||||||
|
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||||
|
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
|
||||||
|
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i].get();
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||||
|
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||||
|
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
|
||||||
|
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||||
|
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
|
||||||
|
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
|
||||||
|
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout<<"OK!\n"<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,101 @@
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <random>
|
||||||
|
#include <future>
|
||||||
|
#include <omp.h>
|
||||||
|
#include "../cblas.h"
|
||||||
|
#include "cpp_thread_safety_common.h"
|
||||||
|
|
||||||
|
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
|
||||||
|
const blasint inc = 1;
|
||||||
|
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]){
|
||||||
|
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
|
||||||
|
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||||
|
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||||
|
|
||||||
|
if (argc > 4){
|
||||||
|
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
if(argc == 4){
|
||||||
|
std::vector<std::string> cliArgs;
|
||||||
|
for (int i = 1; i < argc; i++){
|
||||||
|
cliArgs.push_back(argv[i]);
|
||||||
|
std::cout<<argv[i]<<std::endl;
|
||||||
|
}
|
||||||
|
randomMatSize = std::stoul(cliArgs.at(0));
|
||||||
|
numConcurrentThreads = std::stoul(cliArgs.at(1));
|
||||||
|
numTestRounds = std::stoul(cliArgs.at(2));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||||
|
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
|
||||||
|
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
|
||||||
|
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||||
|
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"| DGEMV thread safety tester |\n";
|
||||||
|
std::cout<<"*----------------------------*\n";
|
||||||
|
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
|
||||||
|
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||||
|
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||||
|
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||||
|
|
||||||
|
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||||
|
std::mt19937_64 PRNG = InitPRNG();
|
||||||
|
std::cout<<"done\n";
|
||||||
|
|
||||||
|
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
|
||||||
|
std::cout<<"Allocating matrices..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
matBlock.at(i).resize(randomMatSize*randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Allocating vectors..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
|
||||||
|
vecBlock.at(i).resize(randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
//pauser();
|
||||||
|
|
||||||
|
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||||
|
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
|
||||||
|
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Filling vectors with random numbers..."<<std::flush;
|
||||||
|
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
|
||||||
|
std::cout<<"done\n";
|
||||||
|
|
||||||
|
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
|
||||||
|
omp_set_num_threads(numConcurrentThreads);
|
||||||
|
for(uint32_t R=0; R<numTestRounds; R++){
|
||||||
|
std::cout<<"DGEMV round #"<<R<<std::endl;
|
||||||
|
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||||
|
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||||
|
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||||
|
futureBlock[i].get();
|
||||||
|
}
|
||||||
|
std::cout<<"done\n";
|
||||||
|
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||||
|
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
|
||||||
|
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||||
|
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
|
||||||
|
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
|
||||||
|
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::cout<<"OK!\n"<<std::endl;
|
||||||
|
}
|
||||||
|
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
9
cpuid.h
9
cpuid.h
|
@ -53,6 +53,7 @@
|
||||||
#define VENDOR_SIS 8
|
#define VENDOR_SIS 8
|
||||||
#define VENDOR_TRANSMETA 9
|
#define VENDOR_TRANSMETA 9
|
||||||
#define VENDOR_NSC 10
|
#define VENDOR_NSC 10
|
||||||
|
#define VENDOR_HYGON 11
|
||||||
#define VENDOR_UNKNOWN 99
|
#define VENDOR_UNKNOWN 99
|
||||||
|
|
||||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||||
|
@ -115,6 +116,8 @@
|
||||||
#define CORE_STEAMROLLER 25
|
#define CORE_STEAMROLLER 25
|
||||||
#define CORE_EXCAVATOR 26
|
#define CORE_EXCAVATOR 26
|
||||||
#define CORE_ZEN 27
|
#define CORE_ZEN 27
|
||||||
|
#define CORE_SKYLAKEX 28
|
||||||
|
#define CORE_DHYANA 29
|
||||||
|
|
||||||
#define HAVE_SSE (1 << 0)
|
#define HAVE_SSE (1 << 0)
|
||||||
#define HAVE_SSE2 (1 << 1)
|
#define HAVE_SSE2 (1 << 1)
|
||||||
|
@ -137,6 +140,8 @@
|
||||||
#define HAVE_AVX (1 << 18)
|
#define HAVE_AVX (1 << 18)
|
||||||
#define HAVE_FMA4 (1 << 19)
|
#define HAVE_FMA4 (1 << 19)
|
||||||
#define HAVE_FMA3 (1 << 20)
|
#define HAVE_FMA3 (1 << 20)
|
||||||
|
#define HAVE_AVX512VL (1 << 21)
|
||||||
|
#define HAVE_AVX2 (1 << 22)
|
||||||
|
|
||||||
#define CACHE_INFO_L1_I 1
|
#define CACHE_INFO_L1_I 1
|
||||||
#define CACHE_INFO_L1_D 2
|
#define CACHE_INFO_L1_D 2
|
||||||
|
@ -211,5 +216,9 @@ typedef struct {
|
||||||
#define CPUTYPE_STEAMROLLER 49
|
#define CPUTYPE_STEAMROLLER 49
|
||||||
#define CPUTYPE_EXCAVATOR 50
|
#define CPUTYPE_EXCAVATOR 50
|
||||||
#define CPUTYPE_ZEN 51
|
#define CPUTYPE_ZEN 51
|
||||||
|
#define CPUTYPE_SKYLAKEX 52
|
||||||
|
#define CPUTYPE_DHYANA 53
|
||||||
|
|
||||||
|
#define CPUTYPE_HYGON_UNKNOWN 54
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -34,7 +34,7 @@
|
||||||
#define CPU_CORTEXA15 4
|
#define CPU_CORTEXA15 4
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"UNKOWN",
|
"UNKNOWN",
|
||||||
"ARMV6",
|
"ARMV6",
|
||||||
"ARMV7",
|
"ARMV7",
|
||||||
"CORTEXA9",
|
"CORTEXA9",
|
||||||
|
|
192
cpuid_arm64.c
192
cpuid_arm64.c
|
@ -29,27 +29,47 @@
|
||||||
|
|
||||||
#define CPU_UNKNOWN 0
|
#define CPU_UNKNOWN 0
|
||||||
#define CPU_ARMV8 1
|
#define CPU_ARMV8 1
|
||||||
#define CPU_CORTEXA57 2
|
// Arm
|
||||||
#define CPU_VULCAN 3
|
#define CPU_CORTEXA53 2
|
||||||
#define CPU_THUNDERX 4
|
#define CPU_CORTEXA57 3
|
||||||
#define CPU_THUNDERX2T99 5
|
#define CPU_CORTEXA72 4
|
||||||
|
#define CPU_CORTEXA73 5
|
||||||
|
// Qualcomm
|
||||||
|
#define CPU_FALKOR 6
|
||||||
|
// Cavium
|
||||||
|
#define CPU_THUNDERX 7
|
||||||
|
#define CPU_THUNDERX2T99 8
|
||||||
|
// HiSilicon
|
||||||
|
#define CPU_TSV110 9
|
||||||
|
// Ampere
|
||||||
|
#define CPU_EMAG8180 10
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"UNKNOWN",
|
"UNKNOWN",
|
||||||
"ARMV8" ,
|
"ARMV8" ,
|
||||||
|
"CORTEXA53",
|
||||||
"CORTEXA57",
|
"CORTEXA57",
|
||||||
"VULCAN",
|
"CORTEXA72",
|
||||||
|
"CORTEXA73",
|
||||||
|
"FALKOR",
|
||||||
"THUNDERX",
|
"THUNDERX",
|
||||||
"THUNDERX2T99"
|
"THUNDERX2T99",
|
||||||
|
"TSV110",
|
||||||
|
"EMAG8180"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *cpuname_lower[] = {
|
static char *cpuname_lower[] = {
|
||||||
"unknown",
|
"unknown",
|
||||||
"armv8",
|
"armv8",
|
||||||
|
"cortexa53",
|
||||||
"cortexa57",
|
"cortexa57",
|
||||||
"vulcan",
|
"cortexa72",
|
||||||
|
"cortexa73",
|
||||||
|
"falkor",
|
||||||
"thunderx",
|
"thunderx",
|
||||||
"thunderx2t99"
|
"thunderx2t99",
|
||||||
|
"tsv110",
|
||||||
|
"emag8180"
|
||||||
};
|
};
|
||||||
|
|
||||||
int get_feature(char *search)
|
int get_feature(char *search)
|
||||||
|
@ -78,7 +98,7 @@ int get_feature(char *search)
|
||||||
if( p == NULL ) return 0;
|
if( p == NULL ) return 0;
|
||||||
|
|
||||||
t = strtok(p," ");
|
t = strtok(p," ");
|
||||||
while( t = strtok(NULL," "))
|
while( (t = strtok(NULL," ")))
|
||||||
{
|
{
|
||||||
if (!strcmp(t, search)) { return(1); }
|
if (!strcmp(t, search)) { return(1); }
|
||||||
}
|
}
|
||||||
|
@ -114,15 +134,31 @@ int detect(void)
|
||||||
|
|
||||||
fclose(infile);
|
fclose(infile);
|
||||||
if(cpu_part != NULL && cpu_implementer != NULL) {
|
if(cpu_part != NULL && cpu_implementer != NULL) {
|
||||||
if (strstr(cpu_implementer, "0x41") &&
|
// Arm
|
||||||
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") ))
|
if (strstr(cpu_implementer, "0x41")) {
|
||||||
return CPU_CORTEXA57; //or compatible A53, A72
|
if (strstr(cpu_part, "0xd03"))
|
||||||
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
|
return CPU_CORTEXA53;
|
||||||
return CPU_VULCAN;
|
else if (strstr(cpu_part, "0xd07"))
|
||||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
return CPU_CORTEXA57;
|
||||||
|
else if (strstr(cpu_part, "0xd08"))
|
||||||
|
return CPU_CORTEXA72;
|
||||||
|
else if (strstr(cpu_part, "0xd09"))
|
||||||
|
return CPU_CORTEXA73;
|
||||||
|
}
|
||||||
|
// Qualcomm
|
||||||
|
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||||
|
return CPU_FALKOR;
|
||||||
|
// Cavium
|
||||||
|
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
|
||||||
return CPU_THUNDERX;
|
return CPU_THUNDERX;
|
||||||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
|
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
|
||||||
return CPU_THUNDERX2T99;
|
return CPU_THUNDERX2T99;
|
||||||
|
// HiSilicon
|
||||||
|
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
|
||||||
|
return CPU_TSV110;
|
||||||
|
// Ampere
|
||||||
|
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
|
||||||
|
return CPU_EMAG8180;
|
||||||
}
|
}
|
||||||
|
|
||||||
p = (char *) NULL ;
|
p = (char *) NULL ;
|
||||||
|
@ -177,15 +213,50 @@ void get_subdirname(void)
|
||||||
printf("arm64");
|
printf("arm64");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Prints a "#define NUM_CORES <n>" line to stdout, where <n> is the number of
 * lines in /proc/cpuinfo that start with "processor".
 *
 * Nothing is printed when the `linux` macro is not defined, or when
 * /proc/cpuinfo cannot be opened.
 */
void get_cpucount(void)
{
#ifdef linux
    int n = 0;
    FILE *infile;
    char buffer[2048];

    infile = fopen("/proc/cpuinfo", "r");
    if (infile == NULL) {
        /* Passing a NULL stream to fgets()/fclose() is undefined behaviour. */
        return;
    }

    while (fgets(buffer, sizeof(buffer), infile)) {
        if (!strncmp("processor", buffer, 9)) {
            n++;
        }
    }

    fclose(infile);

    printf("#define NUM_CORES %d\n", n);
#endif
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void get_cpuconfig(void)
|
void get_cpuconfig(void)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
// All arches should define ARMv8
|
||||||
|
printf("#define ARMV8\n");
|
||||||
|
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
|
||||||
|
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
|
||||||
|
|
||||||
int d = detect();
|
int d = detect();
|
||||||
switch (d)
|
switch (d)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
case CPU_CORTEXA53:
|
||||||
|
printf("#define %s\n", cpuname[d]);
|
||||||
|
// Fall-through
|
||||||
case CPU_ARMV8:
|
case CPU_ARMV8:
|
||||||
printf("#define ARMV8\n");
|
// Minimum parameters for ARMv8 (based on A53)
|
||||||
printf("#define L1_DATA_SIZE 32768\n");
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
printf("#define L1_DATA_LINESIZE 64\n");
|
printf("#define L1_DATA_LINESIZE 64\n");
|
||||||
printf("#define L2_SIZE 262144\n");
|
printf("#define L2_SIZE 262144\n");
|
||||||
|
@ -195,49 +266,40 @@ void get_cpuconfig(void)
|
||||||
printf("#define L2_ASSOCIATIVE 4\n");
|
printf("#define L2_ASSOCIATIVE 4\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case CPU_VULCAN:
|
|
||||||
printf("#define VULCAN \n");
|
|
||||||
printf("#define HAVE_VFP \n");
|
|
||||||
printf("#define HAVE_VFPV3 \n");
|
|
||||||
printf("#define HAVE_NEON \n");
|
|
||||||
printf("#define HAVE_VFPV4 \n");
|
|
||||||
printf("#define L1_CODE_SIZE 32768 \n");
|
|
||||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
|
||||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
|
||||||
printf("#define L1_DATA_SIZE 32768 \n");
|
|
||||||
printf("#define L1_DATA_LINESIZE 64 \n");
|
|
||||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
|
|
||||||
printf("#define L2_SIZE 262144 \n");
|
|
||||||
printf("#define L2_LINESIZE 64 \n");
|
|
||||||
printf("#define L2_ASSOCIATIVE 8 \n");
|
|
||||||
printf("#define L3_SIZE 33554432 \n");
|
|
||||||
printf("#define L3_LINESIZE 64 \n");
|
|
||||||
printf("#define L3_ASSOCIATIVE 32 \n");
|
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
|
||||||
printf("#define DTB_SIZE 4096 \n");
|
|
||||||
break;
|
|
||||||
|
|
||||||
case CPU_CORTEXA57:
|
case CPU_CORTEXA57:
|
||||||
printf("#define CORTEXA57\n");
|
case CPU_CORTEXA72:
|
||||||
printf("#define HAVE_VFP\n");
|
case CPU_CORTEXA73:
|
||||||
printf("#define HAVE_VFPV3\n");
|
// Common minimum settings for these Arm cores
|
||||||
printf("#define HAVE_NEON\n");
|
// Can change a lot, but we need to be conservative
|
||||||
printf("#define HAVE_VFPV4\n");
|
// TODO: detect info from /sys if possible
|
||||||
|
printf("#define %s\n", cpuname[d]);
|
||||||
printf("#define L1_CODE_SIZE 49152\n");
|
printf("#define L1_CODE_SIZE 49152\n");
|
||||||
printf("#define L1_CODE_LINESIZE 64\n");
|
printf("#define L1_CODE_LINESIZE 64\n");
|
||||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||||
printf("#define L1_DATA_SIZE 32768\n");
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
printf("#define L1_DATA_LINESIZE 64\n");
|
printf("#define L1_DATA_LINESIZE 64\n");
|
||||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||||
printf("#define L2_SIZE 2097152\n");
|
printf("#define L2_SIZE 524288\n");
|
||||||
printf("#define L2_LINESIZE 64\n");
|
printf("#define L2_LINESIZE 64\n");
|
||||||
printf("#define L2_ASSOCIATIVE 16\n");
|
printf("#define L2_ASSOCIATIVE 16\n");
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
printf("#define DTB_SIZE 4096\n");
|
printf("#define DTB_SIZE 4096\n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case CPU_FALKOR:
|
||||||
|
printf("#define FALKOR\n");
|
||||||
|
printf("#define L1_CODE_SIZE 65536\n");
|
||||||
|
printf("#define L1_CODE_LINESIZE 64\n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 128\n");
|
||||||
|
printf("#define L2_SIZE 524288\n");
|
||||||
|
printf("#define L2_LINESIZE 64\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 16\n");
|
||||||
|
break;
|
||||||
|
|
||||||
case CPU_THUNDERX:
|
case CPU_THUNDERX:
|
||||||
printf("#define ARMV8\n");
|
|
||||||
printf("#define THUNDERX\n");
|
printf("#define THUNDERX\n");
|
||||||
printf("#define L1_DATA_SIZE 32768\n");
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
printf("#define L1_DATA_LINESIZE 128\n");
|
printf("#define L1_DATA_LINESIZE 128\n");
|
||||||
|
@ -249,11 +311,7 @@ void get_cpuconfig(void)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case CPU_THUNDERX2T99:
|
case CPU_THUNDERX2T99:
|
||||||
printf("#define VULCAN \n");
|
printf("#define THUNDERX2T99 \n");
|
||||||
printf("#define HAVE_VFP \n");
|
|
||||||
printf("#define HAVE_VFPV3 \n");
|
|
||||||
printf("#define HAVE_NEON \n");
|
|
||||||
printf("#define HAVE_VFPV4 \n");
|
|
||||||
printf("#define L1_CODE_SIZE 32768 \n");
|
printf("#define L1_CODE_SIZE 32768 \n");
|
||||||
printf("#define L1_CODE_LINESIZE 64 \n");
|
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
|
||||||
|
@ -269,7 +327,35 @@ void get_cpuconfig(void)
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||||
printf("#define DTB_SIZE 4096 \n");
|
printf("#define DTB_SIZE 4096 \n");
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case CPU_TSV110:
|
||||||
|
printf("#define TSV110 \n");
|
||||||
|
printf("#define L1_CODE_SIZE 65536 \n");
|
||||||
|
printf("#define L1_CODE_LINESIZE 64 \n");
|
||||||
|
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
|
||||||
|
printf("#define L1_DATA_SIZE 65536 \n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 64 \n");
|
||||||
|
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
|
||||||
|
printf("#define L2_SIZE 524228 \n");
|
||||||
|
printf("#define L2_LINESIZE 64 \n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 8 \n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||||
|
printf("#define DTB_SIZE 4096 \n");
|
||||||
|
break;
|
||||||
|
|
||||||
|
case CPU_EMAG8180:
|
||||||
|
// Minimum parameters for ARMv8 (based on A53)
|
||||||
|
printf("#define EMAG8180\n");
|
||||||
|
printf("#define L1_CODE_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 64\n");
|
||||||
|
printf("#define L2_SIZE 262144\n");
|
||||||
|
printf("#define L2_LINESIZE 64\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
get_cpucount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -305,12 +391,10 @@ void get_features(void)
|
||||||
if( p == NULL ) return;
|
if( p == NULL ) return;
|
||||||
|
|
||||||
t = strtok(p," ");
|
t = strtok(p," ");
|
||||||
while( t = strtok(NULL," "))
|
while( (t = strtok(NULL," ")))
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
58
cpuid_mips.c
58
cpuid_mips.c
|
@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define CPU_UNKNOWN 0
|
#define CPU_UNKNOWN 0
|
||||||
#define CPU_P5600 1
|
#define CPU_P5600 1
|
||||||
|
#define CPU_1004K 2
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"UNKOWN",
|
"UNKNOWN",
|
||||||
"P5600"
|
"P5600",
|
||||||
|
"1004K"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void){
|
int detect(void){
|
||||||
|
@ -99,43 +101,13 @@ int detect(void){
|
||||||
fclose(infile);
|
fclose(infile);
|
||||||
|
|
||||||
if(p != NULL){
|
if(p != NULL){
|
||||||
if (strstr(p, "Loongson-3A")){
|
if (strstr(p, "5600")) {
|
||||||
return CPU_LOONGSON3A;
|
return CPU_P5600;
|
||||||
}else if(strstr(p, "Loongson-3B")){
|
} else if (strstr(p, "1004K")) {
|
||||||
return CPU_LOONGSON3B;
|
return CPU_1004K;
|
||||||
}else if (strstr(p, "Loongson-3")){
|
} else
|
||||||
infile = fopen("/proc/cpuinfo", "r");
|
|
||||||
p = (char *)NULL;
|
|
||||||
while (fgets(buffer, sizeof(buffer), infile)){
|
|
||||||
if (!strncmp("system type", buffer, 11)){
|
|
||||||
p = strchr(buffer, ':') + 2;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose(infile);
|
|
||||||
if (strstr(p, "loongson3a"))
|
|
||||||
return CPU_LOONGSON3A;
|
|
||||||
}else{
|
|
||||||
return CPU_UNKNOWN;
|
return CPU_UNKNOWN;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
//Check model name for Loongson3
|
|
||||||
infile = fopen("/proc/cpuinfo", "r");
|
|
||||||
p = (char *)NULL;
|
|
||||||
while (fgets(buffer, sizeof(buffer), infile)){
|
|
||||||
if (!strncmp("model name", buffer, 10)){
|
|
||||||
p = strchr(buffer, ':') + 2;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fclose(infile);
|
|
||||||
if(p != NULL){
|
|
||||||
if (strstr(p, "Loongson-3A")){
|
|
||||||
return CPU_LOONGSON3A;
|
|
||||||
}else if(strstr(p, "Loongson-3B")){
|
|
||||||
return CPU_LOONGSON3B;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
return CPU_UNKNOWN;
|
return CPU_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
@ -149,7 +121,7 @@ void get_architecture(void){
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_subarchitecture(void){
|
void get_subarchitecture(void){
|
||||||
if(detect()==CPU_P5600){
|
if(detect()==CPU_P5600|| detect()==CPU_1004K){
|
||||||
printf("P5600");
|
printf("P5600");
|
||||||
}else{
|
}else{
|
||||||
printf("UNKNOWN");
|
printf("UNKNOWN");
|
||||||
|
@ -170,6 +142,14 @@ void get_cpuconfig(void){
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
printf("#define DTB_SIZE 4096\n");
|
printf("#define DTB_SIZE 4096\n");
|
||||||
printf("#define L2_ASSOCIATIVE 8\n");
|
printf("#define L2_ASSOCIATIVE 8\n");
|
||||||
|
} else if (detect()==CPU_1004K) {
|
||||||
|
printf("#define MIPS1004K\n");
|
||||||
|
printf("#define L1_DATA_SIZE 32768\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 32\n");
|
||||||
|
printf("#define L2_SIZE 26144\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 4\n");
|
||||||
}else{
|
}else{
|
||||||
printf("#define UNKNOWN\n");
|
printf("#define UNKNOWN\n");
|
||||||
}
|
}
|
||||||
|
@ -178,6 +158,8 @@ void get_cpuconfig(void){
|
||||||
void get_libname(void){
|
void get_libname(void){
|
||||||
if(detect()==CPU_P5600) {
|
if(detect()==CPU_P5600) {
|
||||||
printf("p5600\n");
|
printf("p5600\n");
|
||||||
|
} else if (detect()==CPU_1004K) {
|
||||||
|
printf("1004K\n");
|
||||||
}else{
|
}else{
|
||||||
printf("mips\n");
|
printf("mips\n");
|
||||||
}
|
}
|
||||||
|
|
|
@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CPU_I6500 6
|
#define CPU_I6500 6
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"UNKOWN",
|
"UNKNOWN",
|
||||||
"SICORTEX",
|
"SICORTEX",
|
||||||
"LOONGSON3A",
|
"LOONGSON3A",
|
||||||
"LOONGSON3B",
|
"LOONGSON3B",
|
||||||
|
|
|
@ -56,6 +56,7 @@
|
||||||
#define CPUTYPE_CELL 6
|
#define CPUTYPE_CELL 6
|
||||||
#define CPUTYPE_PPCG4 7
|
#define CPUTYPE_PPCG4 7
|
||||||
#define CPUTYPE_POWER8 8
|
#define CPUTYPE_POWER8 8
|
||||||
|
#define CPUTYPE_POWER9 9
|
||||||
|
|
||||||
char *cpuname[] = {
|
char *cpuname[] = {
|
||||||
"UNKNOWN",
|
"UNKNOWN",
|
||||||
|
@ -66,7 +67,8 @@ char *cpuname[] = {
|
||||||
"POWER6",
|
"POWER6",
|
||||||
"CELL",
|
"CELL",
|
||||||
"PPCG4",
|
"PPCG4",
|
||||||
"POWER8"
|
"POWER8",
|
||||||
|
"POWER9"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *lowercpuname[] = {
|
char *lowercpuname[] = {
|
||||||
|
@ -78,7 +80,8 @@ char *lowercpuname[] = {
|
||||||
"power6",
|
"power6",
|
||||||
"cell",
|
"cell",
|
||||||
"ppcg4",
|
"ppcg4",
|
||||||
"power8"
|
"power8",
|
||||||
|
"power9"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *corename[] = {
|
char *corename[] = {
|
||||||
|
@ -90,7 +93,8 @@ char *corename[] = {
|
||||||
"POWER6",
|
"POWER6",
|
||||||
"CELL",
|
"CELL",
|
||||||
"PPCG4",
|
"PPCG4",
|
||||||
"POWER8"
|
"POWER8",
|
||||||
|
"POWER9"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void){
|
int detect(void){
|
||||||
|
@ -120,6 +124,7 @@ int detect(void){
|
||||||
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||||
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||||
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||||
|
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||||
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||||
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||||
|
|
||||||
|
@ -127,6 +132,33 @@ int detect(void){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _AIX
|
#ifdef _AIX
|
||||||
|
FILE *infile;
|
||||||
|
char buffer[512], *p;
|
||||||
|
|
||||||
|
p = (char *)NULL;
|
||||||
|
infile = popen("prtconf|grep 'Processor Type'", "r");
|
||||||
|
while (fgets(buffer, sizeof(buffer), infile)){
|
||||||
|
if (!strncmp("Pro", buffer, 3)){
|
||||||
|
p = strchr(buffer, ':') + 2;
|
||||||
|
#if 0
|
||||||
|
fprintf(stderr, "%s\n", p);
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pclose(infile);
|
||||||
|
|
||||||
|
if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
|
||||||
|
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
|
||||||
|
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
|
||||||
|
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
|
||||||
|
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
|
||||||
|
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
|
||||||
|
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
|
||||||
|
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
|
||||||
|
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
|
||||||
|
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
|
||||||
return CPUTYPE_POWER5;
|
return CPUTYPE_POWER5;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -142,6 +174,52 @@ int detect(void){
|
||||||
|
|
||||||
return CPUTYPE_PPC970;
|
return CPUTYPE_PPC970;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
|
||||||
|
int id;
|
||||||
|
__asm __volatile("mfpvr %0" : "=r"(id));
|
||||||
|
switch ( id >> 16 ) {
|
||||||
|
case 0x4e: // POWER9
|
||||||
|
return CPUTYPE_POWER9;
|
||||||
|
break;
|
||||||
|
case 0x4d:
|
||||||
|
case 0x4b: // POWER8/8E
|
||||||
|
return CPUTYPE_POWER8;
|
||||||
|
break;
|
||||||
|
case 0x4a:
|
||||||
|
case 0x3f: // POWER7/7E
|
||||||
|
return CPUTYPE_POWER6;
|
||||||
|
break;
|
||||||
|
case 0x3e:
|
||||||
|
return CPUTYPE_POWER6;
|
||||||
|
break;
|
||||||
|
case 0x3a:
|
||||||
|
return CPUTYPE_POWER5;
|
||||||
|
break;
|
||||||
|
case 0x35:
|
||||||
|
case 0x38: // POWER4 /4+
|
||||||
|
return CPUTYPE_POWER4;
|
||||||
|
break;
|
||||||
|
case 0x40:
|
||||||
|
case 0x41: // POWER3 /3+
|
||||||
|
return CPUTYPE_POWER3;
|
||||||
|
break;
|
||||||
|
case 0x39:
|
||||||
|
case 0x3c:
|
||||||
|
case 0x44:
|
||||||
|
case 0x45:
|
||||||
|
return CPUTYPE_PPC970;
|
||||||
|
break;
|
||||||
|
case 0x70:
|
||||||
|
return CPUTYPE_CELL;
|
||||||
|
break;
|
||||||
|
case 0x8003:
|
||||||
|
return CPUTYPE_PPCG4;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
return CPUTYPE_UNKNOWN;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_architecture(void){
|
void get_architecture(void){
|
||||||
|
|
272
cpuid_x86.c
272
cpuid_x86.c
|
@ -50,6 +50,8 @@
|
||||||
#ifdef NO_AVX
|
#ifdef NO_AVX
|
||||||
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
||||||
#define CORE_HASWELL CORE_NEHALEM
|
#define CORE_HASWELL CORE_NEHALEM
|
||||||
|
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
|
||||||
|
#define CORE_SKYLAKEX CORE_NEHALEM
|
||||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
||||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||||
|
@ -95,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
||||||
("mov %%ebx, %%edi;"
|
("mov %%ebx, %%edi;"
|
||||||
"cpuid;"
|
"cpuid;"
|
||||||
"xchgl %%ebx, %%edi;"
|
"xchgl %%ebx, %%edi;"
|
||||||
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
|
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc");
|
||||||
#else
|
#else
|
||||||
__asm__ __volatile__
|
__asm__ __volatile__
|
||||||
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
|
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc");
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -209,6 +211,44 @@ int support_avx(){
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int support_avx2(){
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
int eax, ebx, ecx=0, edx;
|
||||||
|
int ret=0;
|
||||||
|
|
||||||
|
if (!support_avx())
|
||||||
|
return 0;
|
||||||
|
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||||
|
if((ebx & (1<<7)) != 0)
|
||||||
|
ret=1; //OS supports AVX2
|
||||||
|
return ret;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int support_avx512(){
|
||||||
|
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||||
|
int eax, ebx, ecx, edx;
|
||||||
|
int ret=0;
|
||||||
|
|
||||||
|
if (!support_avx())
|
||||||
|
return 0;
|
||||||
|
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||||
|
if((ebx & 32) != 32){
|
||||||
|
ret=0; //OS does not even support AVX2
|
||||||
|
}
|
||||||
|
if((ebx & (1<<31)) != 0){
|
||||||
|
xgetbv(0, &eax, &edx);
|
||||||
|
if((eax & 0xe0) == 0xe0)
|
||||||
|
ret=1; //OS supports AVX512VL
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
int get_vendor(void){
|
int get_vendor(void){
|
||||||
int eax, ebx, ecx, edx;
|
int eax, ebx, ecx, edx;
|
||||||
|
@ -231,6 +271,7 @@ int get_vendor(void){
|
||||||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
|
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
|
||||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
|
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
|
||||||
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
|
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
|
||||||
|
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON;
|
||||||
|
|
||||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||||
|
|
||||||
|
@ -292,6 +333,8 @@ int get_cputype(int gettype){
|
||||||
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
if (support_avx()) feature |= HAVE_AVX;
|
if (support_avx()) feature |= HAVE_AVX;
|
||||||
|
if (support_avx2()) feature |= HAVE_AVX2;
|
||||||
|
if (support_avx512()) feature |= HAVE_AVX512VL;
|
||||||
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1004,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) {
|
if ((get_vendor() == VENDOR_AMD) ||
|
||||||
|
(get_vendor() == VENDOR_HYGON) ||
|
||||||
|
(get_vendor() == VENDOR_CENTAUR)) {
|
||||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
|
||||||
|
|
||||||
LDTB.size = 4096;
|
LDTB.size = 4096;
|
||||||
|
@ -1152,7 +1197,11 @@ int get_cpuname(void){
|
||||||
case 3:
|
case 3:
|
||||||
case 5:
|
case 5:
|
||||||
case 6:
|
case 6:
|
||||||
|
#if defined(__x86_64__) || defined(__amd64__)
|
||||||
|
return CPUTYPE_CORE2;
|
||||||
|
#else
|
||||||
return CPUTYPE_PENTIUM2;
|
return CPUTYPE_PENTIUM2;
|
||||||
|
#endif
|
||||||
case 7:
|
case 7:
|
||||||
case 8:
|
case 8:
|
||||||
case 10:
|
case 10:
|
||||||
|
@ -1166,7 +1215,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_CORE2;
|
return CPUTYPE_CORE2;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 1:
|
case 1: // family 6 exmodel 1
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 6:
|
case 6:
|
||||||
return CPUTYPE_CORE2;
|
return CPUTYPE_CORE2;
|
||||||
|
@ -1183,7 +1232,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_DUNNINGTON;
|
return CPUTYPE_DUNNINGTON;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 2:
|
case 2: // family 6 exmodel 2
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 5:
|
case 5:
|
||||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||||
|
@ -1212,7 +1261,7 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 3:
|
case 3: // family 6 exmodel 3
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 7:
|
case 7:
|
||||||
// Bay Trail
|
// Bay Trail
|
||||||
|
@ -1226,57 +1275,47 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 12:
|
case 12:
|
||||||
case 15:
|
case 15:
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 13:
|
case 13:
|
||||||
//Broadwell
|
//Broadwell
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4: // family 6 exmodel 4
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 5:
|
case 5:
|
||||||
case 6:
|
case 6:
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 7:
|
case 7:
|
||||||
case 15:
|
case 15:
|
||||||
//Broadwell
|
//Broadwell
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 14:
|
case 14:
|
||||||
//Skylake
|
//Skylake
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 12:
|
case 12:
|
||||||
|
@ -1286,54 +1325,85 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 5:
|
case 5: // family 6 exmodel 5
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 6:
|
case 6:
|
||||||
//Broadwell
|
//Broadwell
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 5:
|
case 5:
|
||||||
|
// Skylake X
|
||||||
|
if(support_avx512())
|
||||||
|
return CPUTYPE_SKYLAKEX;
|
||||||
|
if(support_avx2())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
case 14:
|
case 14:
|
||||||
// Skylake
|
// Skylake
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 7:
|
case 7:
|
||||||
// Xeon Phi Knights Landing
|
// Xeon Phi Knights Landing
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 12:
|
case 12:
|
||||||
// Apollo Lake
|
// Apollo Lake
|
||||||
|
case 15:
|
||||||
|
// Denverton
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 6: // family 6 exmodel 6
|
||||||
|
switch (model) {
|
||||||
|
case 6: // Cannon Lake
|
||||||
|
if(support_avx512())
|
||||||
|
return CPUTYPE_SKYLAKEX;
|
||||||
|
if(support_avx2())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
else
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 7: // family 6 exmodel 7
|
||||||
|
switch (model) {
|
||||||
|
case 10: // Goldmont Plus
|
||||||
|
return CPUTYPE_NEHALEM;
|
||||||
|
case 14: // Ice Lake
|
||||||
|
if(support_avx512())
|
||||||
|
return CPUTYPE_SKYLAKEX;
|
||||||
|
if(support_avx2())
|
||||||
|
return CPUTYPE_HASWELL;
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 9:
|
case 9:
|
||||||
case 8:
|
case 8:
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 14: // Kaby Lake
|
case 14: // Kaby Lake and refreshes
|
||||||
if(support_avx())
|
if(support_avx2())
|
||||||
#ifndef NO_AVX2
|
|
||||||
return CPUTYPE_HASWELL;
|
return CPUTYPE_HASWELL;
|
||||||
#else
|
if(support_avx())
|
||||||
return CPUTYPE_SANDYBRIDGE;
|
return CPUTYPE_SANDYBRIDGE;
|
||||||
#endif
|
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
}
|
}
|
||||||
|
@ -1361,7 +1431,11 @@ int get_cpuname(void){
|
||||||
case 0x5:
|
case 0x5:
|
||||||
return CPUTYPE_AMDK6;
|
return CPUTYPE_AMDK6;
|
||||||
case 0x6:
|
case 0x6:
|
||||||
|
#if defined(__x86_64__) || defined(__amd64__)
|
||||||
|
return CPUTYPE_BARCELONA;
|
||||||
|
#else
|
||||||
return CPUTYPE_ATHLON;
|
return CPUTYPE_ATHLON;
|
||||||
|
#endif
|
||||||
case 0xf:
|
case 0xf:
|
||||||
switch (exfamily) {
|
switch (exfamily) {
|
||||||
case 0:
|
case 0:
|
||||||
|
@ -1420,6 +1494,8 @@ int get_cpuname(void){
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 1:
|
case 1:
|
||||||
// AMD Ryzen
|
// AMD Ryzen
|
||||||
|
case 8:
|
||||||
|
// AMD Ryzen2
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
#ifndef NO_AVX2
|
#ifndef NO_AVX2
|
||||||
return CPUTYPE_ZEN;
|
return CPUTYPE_ZEN;
|
||||||
|
@ -1435,6 +1511,26 @@ int get_cpuname(void){
|
||||||
return CPUTYPE_AMD_UNKNOWN;
|
return CPUTYPE_AMD_UNKNOWN;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (vendor == VENDOR_HYGON){
|
||||||
|
switch (family) {
|
||||||
|
case 0xf:
|
||||||
|
switch (exfamily) {
|
||||||
|
case 9:
|
||||||
|
//Hygon Dhyana
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CPUTYPE_ZEN;
|
||||||
|
#else
|
||||||
|
return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CPUTYPE_BARCELONA;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return CPUTYPE_HYGON_UNKNOWN;
|
||||||
|
}
|
||||||
|
|
||||||
if (vendor == VENDOR_CYRIX){
|
if (vendor == VENDOR_CYRIX){
|
||||||
switch (family) {
|
switch (family) {
|
||||||
case 0x4:
|
case 0x4:
|
||||||
|
@ -1556,6 +1652,8 @@ static char *cpuname[] = {
|
||||||
"STEAMROLLER",
|
"STEAMROLLER",
|
||||||
"EXCAVATOR",
|
"EXCAVATOR",
|
||||||
"ZEN",
|
"ZEN",
|
||||||
|
"SKYLAKEX",
|
||||||
|
"DHYANA"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1610,10 +1708,12 @@ static char *lowercpuname[] = {
|
||||||
"steamroller",
|
"steamroller",
|
||||||
"excavator",
|
"excavator",
|
||||||
"zen",
|
"zen",
|
||||||
|
"skylakex",
|
||||||
|
"dhyana"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
"UNKOWN",
|
"UNKNOWN",
|
||||||
"80486",
|
"80486",
|
||||||
"P5",
|
"P5",
|
||||||
"P6",
|
"P6",
|
||||||
|
@ -1641,6 +1741,8 @@ static char *corename[] = {
|
||||||
"STEAMROLLER",
|
"STEAMROLLER",
|
||||||
"EXCAVATOR",
|
"EXCAVATOR",
|
||||||
"ZEN",
|
"ZEN",
|
||||||
|
"SKYLAKEX",
|
||||||
|
"DHYANA"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1672,6 +1774,8 @@ static char *corename_lower[] = {
|
||||||
"steamroller",
|
"steamroller",
|
||||||
"excavator",
|
"excavator",
|
||||||
"zen",
|
"zen",
|
||||||
|
"skylakex",
|
||||||
|
"dhyana"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -1714,7 +1818,11 @@ int get_coretype(void){
|
||||||
case 4:
|
case 4:
|
||||||
case 5:
|
case 5:
|
||||||
case 6:
|
case 6:
|
||||||
|
#if defined(__x86_64__) || defined(__amd64__)
|
||||||
|
return CORE_CORE2;
|
||||||
|
#else
|
||||||
return CORE_P6;
|
return CORE_P6;
|
||||||
|
#endif
|
||||||
case 7:
|
case 7:
|
||||||
return CORE_KATMAI;
|
return CORE_KATMAI;
|
||||||
case 8:
|
case 8:
|
||||||
|
@ -1860,6 +1968,19 @@ int get_coretype(void){
|
||||||
else
|
else
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
case 5:
|
case 5:
|
||||||
|
// Skylake X
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return CORE_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CORE_HASWELL;
|
||||||
|
#else
|
||||||
|
return CORE_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
|
#endif
|
||||||
case 14:
|
case 14:
|
||||||
// Skylake
|
// Skylake
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
|
@ -1885,6 +2006,38 @@ int get_coretype(void){
|
||||||
return CORE_NEHALEM;
|
return CORE_NEHALEM;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case 6:
|
||||||
|
if (model == 6)
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return CORE_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CORE_HASWELL;
|
||||||
|
#else
|
||||||
|
return CORE_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
|
case 7:
|
||||||
|
if (model == 10)
|
||||||
|
return CORE_NEHALEM;
|
||||||
|
if (model == 14)
|
||||||
|
#ifndef NO_AVX512
|
||||||
|
return CORE_SKYLAKEX;
|
||||||
|
#else
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CORE_HASWELL;
|
||||||
|
#else
|
||||||
|
return CORE_SANDYBRIDGE;
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CORE_NEHALEM;
|
||||||
|
#endif
|
||||||
|
break;
|
||||||
case 9:
|
case 9:
|
||||||
case 8:
|
case 8:
|
||||||
if (model == 14) { // Kaby Lake
|
if (model == 14) { // Kaby Lake
|
||||||
|
@ -1908,7 +2061,11 @@ int get_coretype(void){
|
||||||
|
|
||||||
if (vendor == VENDOR_AMD){
|
if (vendor == VENDOR_AMD){
|
||||||
if (family <= 0x5) return CORE_80486;
|
if (family <= 0x5) return CORE_80486;
|
||||||
|
#if defined(__x86_64__) || defined(__amd64__)
|
||||||
|
if (family <= 0xe) return CORE_BARCELONA;
|
||||||
|
#else
|
||||||
if (family <= 0xe) return CORE_ATHLON;
|
if (family <= 0xe) return CORE_ATHLON;
|
||||||
|
#endif
|
||||||
if (family == 0xf){
|
if (family == 0xf){
|
||||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||||
else if (exfamily == 5) return CORE_BOBCAT;
|
else if (exfamily == 5) return CORE_BOBCAT;
|
||||||
|
@ -1958,6 +2115,8 @@ int get_coretype(void){
|
||||||
switch (model) {
|
switch (model) {
|
||||||
case 1:
|
case 1:
|
||||||
// AMD Ryzen
|
// AMD Ryzen
|
||||||
|
case 8:
|
||||||
|
// Ryzen 2
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
#ifndef NO_AVX2
|
#ifndef NO_AVX2
|
||||||
return CORE_ZEN;
|
return CORE_ZEN;
|
||||||
|
@ -1973,6 +2132,23 @@ int get_coretype(void){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (vendor == VENDOR_HYGON){
|
||||||
|
if (family == 0xf){
|
||||||
|
if (exfamily == 9) {
|
||||||
|
if(support_avx())
|
||||||
|
#ifndef NO_AVX2
|
||||||
|
return CORE_ZEN;
|
||||||
|
#else
|
||||||
|
return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
|
||||||
|
#endif
|
||||||
|
else
|
||||||
|
return CORE_BARCELONA;
|
||||||
|
} else {
|
||||||
|
return CORE_BARCELONA;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (vendor == VENDOR_CENTAUR) {
|
if (vendor == VENDOR_CENTAUR) {
|
||||||
switch (family) {
|
switch (family) {
|
||||||
case 0x6:
|
case 0x6:
|
||||||
|
@ -2059,6 +2235,8 @@ void get_cpuconfig(void){
|
||||||
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
|
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
|
||||||
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
|
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
|
||||||
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
|
||||||
|
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
|
||||||
|
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
|
||||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||||
|
@ -2127,6 +2305,8 @@ void get_sse(void){
|
||||||
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
|
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
|
||||||
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
|
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
|
||||||
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
|
||||||
|
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
|
||||||
|
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
|
||||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||||
|
|
|
@ -29,15 +29,21 @@
|
||||||
|
|
||||||
#define CPU_GENERIC 0
|
#define CPU_GENERIC 0
|
||||||
#define CPU_Z13 1
|
#define CPU_Z13 1
|
||||||
|
#define CPU_Z14 2
|
||||||
|
#define CPU_Z15 3
|
||||||
|
|
||||||
static char *cpuname[] = {
|
static char *cpuname[] = {
|
||||||
"ZARCH_GENERIC",
|
"ZARCH_GENERIC",
|
||||||
"Z13"
|
"Z13",
|
||||||
|
"Z14",
|
||||||
|
"Z15"
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *cpuname_lower[] = {
|
static char *cpuname_lower[] = {
|
||||||
"zarch_generic",
|
"zarch_generic",
|
||||||
"z13"
|
"z13",
|
||||||
|
"z14",
|
||||||
|
"z15"
|
||||||
};
|
};
|
||||||
|
|
||||||
int detect(void)
|
int detect(void)
|
||||||
|
@ -61,6 +67,10 @@ int detect(void)
|
||||||
|
|
||||||
if (strstr(p, "2964")) return CPU_Z13;
|
if (strstr(p, "2964")) return CPU_Z13;
|
||||||
if (strstr(p, "2965")) return CPU_Z13;
|
if (strstr(p, "2965")) return CPU_Z13;
|
||||||
|
if (strstr(p, "3906")) return CPU_Z14;
|
||||||
|
if (strstr(p, "3907")) return CPU_Z14;
|
||||||
|
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||||
|
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||||
|
|
||||||
return CPU_GENERIC;
|
return CPU_GENERIC;
|
||||||
}
|
}
|
||||||
|
@ -107,5 +117,16 @@ void get_cpuconfig(void)
|
||||||
printf("#define Z13\n");
|
printf("#define Z13\n");
|
||||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
break;
|
break;
|
||||||
|
case CPU_Z14:
|
||||||
|
printf("#define Z14\n");
|
||||||
|
printf("#define L1_DATA_SIZE 131072\n");
|
||||||
|
printf("#define L1_DATA_LINESIZE 256\n");
|
||||||
|
printf("#define L1_DATA_ASSOCIATIVE 8\n");
|
||||||
|
printf("#define L2_SIZE 4194304\n");
|
||||||
|
printf("#define L2_LINESIZE 256\n");
|
||||||
|
printf("#define L2_ASSOCIATIVE 8\n");
|
||||||
|
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||||
|
printf("#define DTB_SIZE 4096\n");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
6
ctest.c
6
ctest.c
|
@ -101,6 +101,10 @@ OS_INTERIX
|
||||||
OS_LINUX
|
OS_LINUX
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__HAIKU__)
|
||||||
|
OS_HAIKU
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(__i386) || defined(_X86)
|
#if defined(__i386) || defined(_X86)
|
||||||
ARCH_X86
|
ARCH_X86
|
||||||
#endif
|
#endif
|
||||||
|
@ -109,7 +113,7 @@ ARCH_X86
|
||||||
ARCH_X86_64
|
ARCH_X86_64
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
|
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
|
||||||
ARCH_POWER
|
ARCH_POWER
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -6,6 +6,8 @@ TOPDIR = ..
|
||||||
include $(TOPDIR)/Makefile.system
|
include $(TOPDIR)/Makefile.system
|
||||||
|
|
||||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||||
|
override TARGET_ARCH=
|
||||||
|
override TARGET_MACH=
|
||||||
|
|
||||||
LIB = $(TOPDIR)/$(LIBNAME)
|
LIB = $(TOPDIR)/$(LIBNAME)
|
||||||
|
|
||||||
|
@ -102,7 +104,13 @@ clean ::
|
||||||
rm -f x*
|
rm -f x*
|
||||||
|
|
||||||
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
||||||
CEXTRALIB =
|
ifeq ($(USE_OPENMP), 1)
|
||||||
|
ifeq ($(F_COMPILER), GFORTRAN)
|
||||||
|
ifeq ($(C_COMPILER), CLANG)
|
||||||
|
CEXTRALIB = -lomp
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
# Single real
|
# Single real
|
||||||
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
|
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
|
||||||
|
|
|
@ -577,7 +577,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -1503,6 +1503,8 @@ C $ ' .' )
|
||||||
NC = 0
|
NC = 0
|
||||||
RESET = .TRUE.
|
RESET = .TRUE.
|
||||||
ERRMAX = RZERO
|
ERRMAX = RZERO
|
||||||
|
RALS = RONE
|
||||||
|
RBETS = RONE
|
||||||
*
|
*
|
||||||
DO 100 IN = 1, NIDIM
|
DO 100 IN = 1, NIDIM
|
||||||
N = IDIM( IN )
|
N = IDIM( IN )
|
||||||
|
|
|
@ -653,7 +653,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -653,7 +653,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -577,7 +577,7 @@
|
||||||
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
|
||||||
* ************************* STEST1 *****************************
|
* ************************* STEST1 *****************************
|
||||||
*
|
*
|
||||||
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
|
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
|
||||||
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
|
||||||
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
|
||||||
*
|
*
|
||||||
|
|
|
@ -1504,6 +1504,8 @@ C $ ' .' )
|
||||||
NC = 0
|
NC = 0
|
||||||
RESET = .TRUE.
|
RESET = .TRUE.
|
||||||
ERRMAX = RZERO
|
ERRMAX = RZERO
|
||||||
|
RALS = RONE
|
||||||
|
RBETS = RONE
|
||||||
*
|
*
|
||||||
DO 100 IN = 1, NIDIM
|
DO 100 IN = 1, NIDIM
|
||||||
N = IDIM( IN )
|
N = IDIM( IN )
|
||||||
|
|
|
@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||||
6 NUMBER OF VALUES OF N
|
7 NUMBER OF VALUES OF N
|
||||||
1 2 3 5 7 9 35 VALUES OF N
|
1 2 3 5 7 9 35 VALUES OF N
|
||||||
3 NUMBER OF VALUES OF ALPHA
|
3 NUMBER OF VALUES OF ALPHA
|
||||||
0.0 1.0 0.7 VALUES OF ALPHA
|
0.0 1.0 0.7 VALUES OF ALPHA
|
||||||
|
|
|
@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
||||||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||||
6 NUMBER OF VALUES OF N
|
7 NUMBER OF VALUES OF N
|
||||||
0 1 2 3 5 9 35 VALUES OF N
|
0 1 2 3 5 9 35 VALUES OF N
|
||||||
3 NUMBER OF VALUES OF ALPHA
|
3 NUMBER OF VALUES OF ALPHA
|
||||||
0.0 1.0 0.7 VALUES OF ALPHA
|
0.0 1.0 0.7 VALUES OF ALPHA
|
||||||
|
|
|
@ -62,9 +62,36 @@
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef TRANSA
|
#ifndef thread_local
|
||||||
|
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
|
||||||
|
# define thread_local _Thread_local
|
||||||
|
# elif defined _WIN32 && ( \
|
||||||
|
defined _MSC_VER || \
|
||||||
|
defined __ICL || \
|
||||||
|
defined __DMC__ || \
|
||||||
|
defined __BORLANDC__ )
|
||||||
|
# define thread_local __declspec(thread)
|
||||||
|
/* note that ICC (linux) and Clang are covered by __GNUC__ */
|
||||||
|
# elif defined __GNUC__ || \
|
||||||
|
defined __SUNPRO_C || \
|
||||||
|
defined __xlC__
|
||||||
|
# define thread_local __thread
|
||||||
|
# else
|
||||||
|
# define UNSAFE
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#if defined USE_OPENMP
|
||||||
|
#undef UNSAFE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||||
#define Y_DUMMY_NUM 1024
|
#define Y_DUMMY_NUM 1024
|
||||||
|
#if defined(USE_OPENMP)
|
||||||
static FLOAT y_dummy[Y_DUMMY_NUM];
|
static FLOAT y_dummy[Y_DUMMY_NUM];
|
||||||
|
#pragma omp threadprivate(y_dummy)
|
||||||
|
# else
|
||||||
|
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
|
||||||
|
# endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||||
|
@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
||||||
#ifdef TRANSA
|
#ifdef TRANSA
|
||||||
y += n_from * incy * COMPSIZE;
|
y += n_from * incy * COMPSIZE;
|
||||||
#else
|
#else
|
||||||
|
# ifndef UNSAFE
|
||||||
//for split matrix row (n) direction and vector x of gemv_n
|
//for split matrix row (n) direction and vector x of gemv_n
|
||||||
x += n_from * incx * COMPSIZE;
|
x += n_from * incx * COMPSIZE;
|
||||||
//store partial result for every thread
|
//store partial result for every thread
|
||||||
y += (m_to - m_from) * 1 * COMPSIZE * pos;
|
y += (m_to - m_from) * 1 * COMPSIZE * pos;
|
||||||
|
# endif
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
|
|
||||||
BLASLONG width, i, num_cpu;
|
BLASLONG width, i, num_cpu;
|
||||||
|
|
||||||
#ifndef TRANSA
|
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||||
int split_x=0;
|
int split_x=0;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
i -= width;
|
i -= width;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef TRANSA
|
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||||
//try to split matrix on row direction and x.
|
//try to split matrix on row direction and x.
|
||||||
//Then, reduction.
|
//Then, reduction.
|
||||||
if (num_cpu < nthreads) {
|
if (num_cpu < nthreads) {
|
||||||
|
@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
|
||||||
exec_blas(num_cpu, queue);
|
exec_blas(num_cpu, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef TRANSA
|
#if !defined(TRANSA) && !defined(UNSAFE)
|
||||||
if(split_x==1){
|
if(split_x==1){
|
||||||
//reduction
|
//reduction
|
||||||
for(i=0; i<num_cpu; i++){
|
for(i=0; i<num_cpu; i++){
|
||||||
|
|
|
@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
|
||||||
COPY_K(m, b, incb, buffer, 1);
|
COPY_K(m, b, incb, buffer, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */
|
for (is = 0; is < m; is += DTB_ENTRIES){
|
||||||
/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now[B */
|
|
||||||
|
|
||||||
for (is = 0; is < m; is += DTB_ENTRIES * 100){
|
min_i = MIN(m - is, DTB_ENTRIES);
|
||||||
|
|
||||||
min_i = MIN(m - is, DTB_ENTRIES * 100);
|
|
||||||
|
|
||||||
#ifndef TRANSA
|
#ifndef TRANSA
|
||||||
if (is > 0){
|
if (is > 0){
|
||||||
fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n");
|
|
||||||
GEMV_N(is, min_i, 0, dp1,
|
GEMV_N(is, min_i, 0, dp1,
|
||||||
a + is * lda, lda,
|
a + is * lda, lda,
|
||||||
B + is, 1,
|
B + is, 1,
|
||||||
|
|
|
@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||||
|
|
||||||
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
|
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
|
||||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||||
|
|
||||||
queue[num_cpu].mode = mode;
|
queue[num_cpu].mode = mode;
|
||||||
queue[num_cpu].routine = trmv_kernel;
|
queue[num_cpu].routine = trmv_kernel;
|
||||||
|
@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
|
||||||
|
|
||||||
range_m[num_cpu + 1] = range_m[num_cpu] + width;
|
range_m[num_cpu + 1] = range_m[num_cpu] + width;
|
||||||
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
|
||||||
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
|
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
|
||||||
|
|
||||||
queue[num_cpu].mode = mode;
|
queue[num_cpu].mode = mode;
|
||||||
queue[num_cpu].routine = trmv_kernel;
|
queue[num_cpu].routine = trmv_kernel;
|
||||||
|
|
|
@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||||
|
|
||||||
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||||
|
|
|
@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
|
|
@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
#else
|
#else
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||||
else
|
else
|
||||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
|
@ -91,7 +91,12 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
@ -99,7 +104,7 @@ typedef struct {
|
||||||
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
|
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
|
||||||
GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
|
GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
|
||||||
BETA[0], BETA[1], NULL, 0, NULL, 0, \
|
BETA[0], BETA[1], NULL, 0, NULL, 0, \
|
||||||
(FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
|
(FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef ICOPYB_OPERATION
|
#ifndef ICOPYB_OPERATION
|
||||||
|
@ -403,13 +408,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Make sure if no one is using another buffer */
|
/* Make sure if no one is using another buffer */
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++)
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||||
|
|
||||||
STOP_RPCC(waiting1);
|
STOP_RPCC(waiting1);
|
||||||
|
|
||||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -436,6 +441,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++)
|
||||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
|
|
||||||
current = mypos;
|
current = mypos;
|
||||||
|
@ -453,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
/* thread has to wait */
|
/* thread has to wait */
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||||
|
|
||||||
STOP_RPCC(waiting2);
|
STOP_RPCC(waiting2);
|
||||||
|
|
||||||
|
@ -472,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
if (m_to - m_from == min_i) {
|
if (m_to - m_from == min_i) {
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (current != mypos);
|
} while (current != mypos);
|
||||||
|
@ -512,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
if (is + min_i >= m_to) {
|
if (is + min_i >= m_to) {
|
||||||
/* Thread doesn't need this buffer any more */
|
/* Thread doesn't need this buffer any more */
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -536,13 +544,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Make sure if no one is using another buffer */
|
/* Make sure if no one is using another buffer */
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++)
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||||
|
|
||||||
STOP_RPCC(waiting1);
|
STOP_RPCC(waiting1);
|
||||||
|
|
||||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -590,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
/* thread has to wait */
|
/* thread has to wait */
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||||
|
|
||||||
STOP_RPCC(waiting2);
|
STOP_RPCC(waiting2);
|
||||||
|
|
||||||
|
@ -608,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
if (m_to - m_from == min_i) {
|
if (m_to - m_from == min_i) {
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (current != mypos);
|
} while (current != mypos);
|
||||||
|
@ -672,13 +681,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Make sure if no one is using another buffer */
|
/* Make sure if no one is using another buffer */
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++)
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||||
|
|
||||||
STOP_RPCC(waiting1);
|
STOP_RPCC(waiting1);
|
||||||
|
|
||||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||||
|
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
|
@ -726,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
/* thread has to wait */
|
/* thread has to wait */
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||||
|
|
||||||
STOP_RPCC(waiting2);
|
STOP_RPCC(waiting2);
|
||||||
|
|
||||||
|
@ -743,7 +752,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_to - m_from == min_i) {
|
if (m_to - m_from == min_i) {
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (current != mypos);
|
} while (current != mypos);
|
||||||
|
@ -782,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
#endif
|
#endif
|
||||||
if (is + min_i >= m_to) {
|
if (is + min_i >= m_to) {
|
||||||
/* Thread doesn't need this buffer any more */
|
/* Thread doesn't need this buffer any more */
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -799,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
for (i = 0; i < args -> nthreads; i++) {
|
for (i = 0; i < args -> nthreads; i++) {
|
||||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -835,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
||||||
|
|
||||||
|
#ifndef USE_OPENMP
|
||||||
|
#ifndef OS_WINDOWS
|
||||||
|
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
#else
|
||||||
|
CRITICAL_SECTION level3_lock;
|
||||||
|
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
blas_arg_t newarg;
|
blas_arg_t newarg;
|
||||||
|
|
||||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||||
|
@ -864,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
|
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef USE_OPENMP
|
||||||
|
#ifndef OS_WINDOWS
|
||||||
|
pthread_mutex_lock(&level3_lock);
|
||||||
|
#else
|
||||||
|
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
newarg.m = args -> m;
|
newarg.m = args -> m;
|
||||||
newarg.n = args -> n;
|
newarg.n = args -> n;
|
||||||
newarg.k = args -> k;
|
newarg.k = args -> k;
|
||||||
|
@ -968,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
free(job);
|
free(job);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef USE_OPENMP
|
||||||
|
#ifndef OS_WINDOWS
|
||||||
|
pthread_mutex_unlock(&level3_lock);
|
||||||
|
#else
|
||||||
|
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -67,7 +67,12 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Atomic
|
||||||
|
#else
|
||||||
|
volatile
|
||||||
|
#endif
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,10 @@
|
||||||
#define SWITCH_RATIO 2
|
#define SWITCH_RATIO 2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef GEMM_PREFERED_SIZE
|
||||||
|
#define GEMM_PREFERED_SIZE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
//The array of job_t may overflow the stack.
|
//The array of job_t may overflow the stack.
|
||||||
//Instead, use malloc to alloc job_t.
|
//Instead, use malloc to alloc job_t.
|
||||||
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
|
||||||
|
@ -91,7 +95,8 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
volatile
|
||||||
|
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||||
} job_t;
|
} job_t;
|
||||||
|
|
||||||
|
|
||||||
|
@ -346,7 +351,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
/* Make sure if no one is using workspace */
|
/* Make sure if no one is using workspace */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
for (i = 0; i < args -> nthreads; i++)
|
for (i = 0; i < args -> nthreads; i++)
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||||
STOP_RPCC(waiting1);
|
STOP_RPCC(waiting1);
|
||||||
|
|
||||||
#if defined(FUSED_GEMM) && !defined(TIMING)
|
#if defined(FUSED_GEMM) && !defined(TIMING)
|
||||||
|
@ -360,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
/* Split local region of B into parts */
|
/* Split local region of B into parts */
|
||||||
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
|
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
|
||||||
min_jj = MIN(n_to, js + div_n) - jjs;
|
min_jj = MIN(n_to, js + div_n) - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||||
else
|
else
|
||||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
/* Copy part of local region of B into workspace */
|
/* Copy part of local region of B into workspace */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
|
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
|
||||||
|
@ -408,7 +417,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
|
|
||||||
/* Wait until other region of B is initialized */
|
/* Wait until other region of B is initialized */
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||||
STOP_RPCC(waiting2);
|
STOP_RPCC(waiting2);
|
||||||
|
|
||||||
/* Apply kernel with local region of A and part of other region of B */
|
/* Apply kernel with local region of A and part of other region of B */
|
||||||
|
@ -426,6 +435,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
/* Clear synchronization flag if this thread is done with other region of B */
|
/* Clear synchronization flag if this thread is done with other region of B */
|
||||||
if (m_to - m_from == min_i) {
|
if (m_to - m_from == min_i) {
|
||||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||||
|
WMB;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} while (current != mypos);
|
} while (current != mypos);
|
||||||
|
@ -487,7 +497,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
for (i = 0; i < args -> nthreads; i++) {
|
for (i = 0; i < args -> nthreads; i++) {
|
||||||
for (js = 0; js < DIVIDE_RATE; js++) {
|
for (js = 0; js < DIVIDE_RATE; js++) {
|
||||||
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
|
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
STOP_RPCC(waiting3);
|
STOP_RPCC(waiting3);
|
||||||
|
@ -508,10 +518,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int round_up(int remainder, int width, int multiple)
|
||||||
|
{
|
||||||
|
if (multiple > remainder || width <= multiple)
|
||||||
|
return width;
|
||||||
|
width = (width + multiple - 1) / multiple;
|
||||||
|
width = width * multiple;
|
||||||
|
return width;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
*range_n, FLOAT *sa, FLOAT *sb,
|
*range_n, FLOAT *sa, FLOAT *sb,
|
||||||
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
BLASLONG nthreads_m, BLASLONG nthreads_n) {
|
||||||
|
|
||||||
|
#ifndef USE_OPENMP
|
||||||
|
#ifndef OS_WINDOWS
|
||||||
|
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
#else
|
||||||
|
CRITICAL_SECTION level3_lock;
|
||||||
|
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
blas_arg_t newarg;
|
blas_arg_t newarg;
|
||||||
|
|
||||||
#ifndef USE_ALLOC_HEAP
|
#ifndef USE_ALLOC_HEAP
|
||||||
|
@ -552,6 +581,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef USE_OPENMP
|
||||||
|
#ifndef OS_WINDOWS
|
||||||
|
pthread_mutex_lock(&level3_lock);
|
||||||
|
#else
|
||||||
|
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef USE_ALLOC_HEAP
|
#ifdef USE_ALLOC_HEAP
|
||||||
/* Dynamically allocate workspace */
|
/* Dynamically allocate workspace */
|
||||||
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
|
||||||
|
@ -599,9 +636,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
num_parts = 0;
|
num_parts = 0;
|
||||||
while (m > 0){
|
while (m > 0){
|
||||||
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
|
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
|
||||||
|
|
||||||
|
width = round_up(m, width, GEMM_PREFERED_SIZE);
|
||||||
|
|
||||||
m -= width;
|
m -= width;
|
||||||
|
|
||||||
if (m < 0) width = width + m;
|
if (m < 0) width = width + m;
|
||||||
range_M[num_parts + 1] = range_M[num_parts] + width;
|
range_M[num_parts + 1] = range_M[num_parts] + width;
|
||||||
|
|
||||||
num_parts ++;
|
num_parts ++;
|
||||||
}
|
}
|
||||||
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
|
for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
|
||||||
|
@ -643,9 +685,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
if (width < SWITCH_RATIO) {
|
if (width < SWITCH_RATIO) {
|
||||||
width = SWITCH_RATIO;
|
width = SWITCH_RATIO;
|
||||||
}
|
}
|
||||||
|
width = round_up(n, width, GEMM_PREFERED_SIZE);
|
||||||
|
|
||||||
n -= width;
|
n -= width;
|
||||||
if (n < 0) width = width + n;
|
if (n < 0) width = width + n;
|
||||||
range_N[num_parts + 1] = range_N[num_parts] + width;
|
range_N[num_parts + 1] = range_N[num_parts] + width;
|
||||||
|
|
||||||
num_parts ++;
|
num_parts ++;
|
||||||
}
|
}
|
||||||
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
|
for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
|
||||||
|
@ -653,8 +698,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Clear synchronization flags */
|
/* Clear synchronization flags */
|
||||||
for (i = 0; i < MAX_CPU_NUMBER; i++) {
|
for (i = 0; i < nthreads; i++) {
|
||||||
for (j = 0; j < MAX_CPU_NUMBER; j++) {
|
for (j = 0; j < nthreads; j++) {
|
||||||
for (k = 0; k < DIVIDE_RATE; k++) {
|
for (k = 0; k < DIVIDE_RATE; k++) {
|
||||||
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
|
||||||
}
|
}
|
||||||
|
@ -669,6 +714,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||||
free(job);
|
free(job);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifndef USE_OPENMP
|
||||||
|
#ifndef OS_WINDOWS
|
||||||
|
pthread_mutex_unlock(&level3_lock);
|
||||||
|
#else
|
||||||
|
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||||
|
|
||||||
BLASLONG width, i;
|
BLASLONG width, i;
|
||||||
BLASLONG n_from, n_to;
|
BLASLONG n_from, n_to;
|
||||||
double dnum, nf, nt, di;
|
double dnum, nf, nt, di, dinum;
|
||||||
|
|
||||||
int num_cpu;
|
int num_cpu;
|
||||||
int mask = 0;
|
int mask = 0;
|
||||||
|
@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||||
if (nthreads - num_cpu > 1) {
|
if (nthreads - num_cpu > 1) {
|
||||||
|
|
||||||
di = (double)i;
|
di = (double)i;
|
||||||
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
|
dinum = di * di +dnum;
|
||||||
|
if (dinum <0)
|
||||||
|
width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1);
|
||||||
|
else
|
||||||
|
width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1);
|
||||||
|
|
||||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||||
|
|
||||||
|
@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||||
|
|
||||||
nf = (double)(arg -> n - n_from);
|
nf = (double)(arg -> n - n_from);
|
||||||
nt = (double)(arg -> n - n_to);
|
nt = (double)(arg -> n - n_to);
|
||||||
|
|
||||||
dnum = (nt * nt - nf * nf) / (double)nthreads;
|
dnum = (nt * nt - nf * nf) / (double)nthreads;
|
||||||
|
|
||||||
num_cpu = 0;
|
num_cpu = 0;
|
||||||
|
|
||||||
range[0] = n_from;
|
range[0] = n_from;
|
||||||
|
@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||||
if (nthreads - num_cpu > 1) {
|
if (nthreads - num_cpu > 1) {
|
||||||
|
|
||||||
di = (double)(arg -> n - i);
|
di = (double)(arg -> n - i);
|
||||||
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
|
dinum = di * di + dnum;
|
||||||
|
if (dinum<0)
|
||||||
|
width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1);
|
||||||
|
else
|
||||||
|
width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1);
|
||||||
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||||
|
@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||||
|
@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
|
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||||
|
@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
START_RPCC();
|
START_RPCC();
|
||||||
|
|
||||||
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
|
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||||
|
|
|
@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
||||||
min_jj = ls - js - jjs;
|
min_jj = ls - js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
#ifndef TRANSA
|
#ifndef TRANSA
|
||||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
|
GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
|
||||||
#else
|
#else
|
||||||
|
@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||||
min_jj = min_l - jjs;
|
min_jj = min_l - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
#ifndef TRANSA
|
#ifndef TRANSA
|
||||||
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
|
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
|
||||||
#else
|
#else
|
||||||
|
@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
#ifndef TRANSA
|
#ifndef TRANSA
|
||||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||||
#else
|
#else
|
||||||
|
@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||||
min_jj = min_l - jjs;
|
min_jj = min_l - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
#ifndef TRANSA
|
#ifndef TRANSA
|
||||||
TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
|
TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
|
||||||
#else
|
#else
|
||||||
|
@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
||||||
min_jj = js - ls - min_l - jjs;
|
min_jj = js - ls - min_l - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
#ifndef TRANSA
|
#ifndef TRANSA
|
||||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
|
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
|
||||||
sb + min_l * (min_l + jjs) * COMPSIZE);
|
sb + min_l * (min_l + jjs) * COMPSIZE);
|
||||||
|
@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
||||||
|
|
||||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||||
min_jj = min_j + js - jjs;
|
min_jj = min_j + js - jjs;
|
||||||
|
#ifdef SKYLAKEX
|
||||||
|
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||||
|
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||||
|
#else
|
||||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||||
else
|
else
|
||||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||||
|
#endif
|
||||||
#ifndef TRANSA
|
#ifndef TRANSA
|
||||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1)
|
||||||
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
|
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
|
||||||
|
|
||||||
if (DYNAMIC_ARCH)
|
if (DYNAMIC_ARCH)
|
||||||
|
if (ARM64)
|
||||||
|
list(APPEND COMMON_SOURCES dynamic_arm64.c)
|
||||||
|
else ()
|
||||||
list(APPEND COMMON_SOURCES dynamic.c)
|
list(APPEND COMMON_SOURCES dynamic.c)
|
||||||
|
endif ()
|
||||||
else ()
|
else ()
|
||||||
list(APPEND COMMON_SOURCES parameter.c)
|
list(APPEND COMMON_SOURCES parameter.c)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
|
@ -15,7 +15,19 @@ endif
|
||||||
# COMMONOBJS += info.$(SUFFIX)
|
# COMMONOBJS += info.$(SUFFIX)
|
||||||
|
|
||||||
ifeq ($(DYNAMIC_ARCH), 1)
|
ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
|
ifeq ($(ARCH),arm64)
|
||||||
|
COMMONOBJS += dynamic_arm64.$(SUFFIX)
|
||||||
|
else
|
||||||
|
ifeq ($(ARCH),power)
|
||||||
|
COMMONOBJS += dynamic_power.$(SUFFIX)
|
||||||
|
else
|
||||||
|
ifeq ($(ARCH),zarch)
|
||||||
|
COMMONOBJS += dynamic_zarch.$(SUFFIX)
|
||||||
|
else
|
||||||
COMMONOBJS += dynamic.$(SUFFIX)
|
COMMONOBJS += dynamic.$(SUFFIX)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
COMMONOBJS += parameter.$(SUFFIX)
|
COMMONOBJS += parameter.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
|
@ -71,7 +83,19 @@ BLAS_SERVER = blas_server.c
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(DYNAMIC_ARCH), 1)
|
ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
|
ifeq ($(ARCH),arm64)
|
||||||
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
|
||||||
|
else
|
||||||
|
ifeq ($(ARCH),power)
|
||||||
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
|
||||||
|
else
|
||||||
|
ifeq ($(ARCH),zarch)
|
||||||
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
|
||||||
|
else
|
||||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
|
||||||
#include <dlfcn.h>
|
#include <dlfcn.h>
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
|
||||||
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
|
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
|
||||||
/* jobs is queued. */
|
/* jobs is queued. */
|
||||||
|
|
||||||
/* We need this grobal for cheking if initialization is finished. */
|
/* We need this global for checking if initialization is finished. */
|
||||||
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
|
||||||
|
|
||||||
/* Local Variables */
|
/* Local Variables */
|
||||||
|
@ -150,7 +150,7 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
|
||||||
/* Monitor is a function to see thread's status for every seconds. */
|
/* Monitor is a function to see thread's status for every second. */
|
||||||
/* Usually it turns off and it's for debugging. */
|
/* Usually it turns off and it's for debugging. */
|
||||||
|
|
||||||
static pthread_t monitor_thread;
|
static pthread_t monitor_thread;
|
||||||
|
@ -582,7 +582,7 @@ int blas_thread_init(void){
|
||||||
if(ret!=0){
|
if(ret!=0){
|
||||||
struct rlimit rlim;
|
struct rlimit rlim;
|
||||||
const char *msg = strerror(ret);
|
const char *msg = strerror(ret);
|
||||||
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg);
|
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
|
||||||
#ifdef RLIMIT_NPROC
|
#ifdef RLIMIT_NPROC
|
||||||
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
|
||||||
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
|
||||||
|
@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
long i;
|
long i;
|
||||||
|
|
||||||
|
#ifdef SMP_SERVER
|
||||||
|
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||||
|
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||||
|
#endif
|
||||||
|
|
||||||
if (num_threads < 1) num_threads = blas_num_threads;
|
if (num_threads < 1) num_threads = blas_num_threads;
|
||||||
|
|
||||||
#ifndef NO_AFFINITY
|
#ifndef NO_AFFINITY
|
||||||
|
|
|
@ -36,6 +36,7 @@
|
||||||
/* or implied, of The University of Texas at Austin. */
|
/* or implied, of The University of Texas at Austin. */
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
//#include <sys/mman.h>
|
//#include <sys/mman.h>
|
||||||
|
@ -47,13 +48,22 @@
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
|
#ifndef OMP_SCHED
|
||||||
|
#define OMP_SCHED static
|
||||||
|
#endif
|
||||||
|
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||||
|
#else
|
||||||
|
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||||
|
#endif
|
||||||
|
|
||||||
void goto_set_num_threads(int num_threads) {
|
void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
int i=0;
|
int i=0, j=0;
|
||||||
|
|
||||||
if (num_threads < 1) num_threads = blas_num_threads;
|
if (num_threads < 1) num_threads = blas_num_threads;
|
||||||
|
|
||||||
|
@ -68,15 +78,17 @@ void goto_set_num_threads(int num_threads) {
|
||||||
omp_set_num_threads(blas_cpu_number);
|
omp_set_num_threads(blas_cpu_number);
|
||||||
|
|
||||||
//adjust buffer for each thread
|
//adjust buffer for each thread
|
||||||
for(i=0; i<blas_cpu_number; i++){
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||||
if(blas_thread_buffer[i]==NULL){
|
for(j=0; j<blas_cpu_number; j++){
|
||||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
if(blas_thread_buffer[i][j]==NULL){
|
||||||
|
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for(; i<MAX_CPU_NUMBER; i++){
|
for(; j<MAX_CPU_NUMBER; j++){
|
||||||
if(blas_thread_buffer[i]!=NULL){
|
if(blas_thread_buffer[i][j]!=NULL){
|
||||||
blas_memory_free(blas_thread_buffer[i]);
|
blas_memory_free(blas_thread_buffer[i][j]);
|
||||||
blas_thread_buffer[i]=NULL;
|
blas_thread_buffer[i][j]=NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#if defined(ARCH_MIPS64)
|
#if defined(ARCH_MIPS64)
|
||||||
|
@ -92,30 +104,34 @@ void openblas_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
int blas_thread_init(void){
|
int blas_thread_init(void){
|
||||||
|
|
||||||
int i=0;
|
int i=0, j=0;
|
||||||
|
|
||||||
blas_get_cpu_number();
|
blas_get_cpu_number();
|
||||||
|
|
||||||
blas_server_avail = 1;
|
blas_server_avail = 1;
|
||||||
|
|
||||||
for(i=0; i<blas_num_threads; i++){
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
for(j=0; j<blas_num_threads; j++){
|
||||||
|
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||||
|
}
|
||||||
|
for(; j<MAX_CPU_NUMBER; j++){
|
||||||
|
blas_thread_buffer[i][j]=NULL;
|
||||||
}
|
}
|
||||||
for(; i<MAX_CPU_NUMBER; i++){
|
|
||||||
blas_thread_buffer[i]=NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int BLASFUNC(blas_thread_shutdown)(void){
|
int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
int i=0;
|
int i=0, j=0;
|
||||||
blas_server_avail = 0;
|
blas_server_avail = 0;
|
||||||
|
|
||||||
for(i=0; i<MAX_CPU_NUMBER; i++){
|
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||||
if(blas_thread_buffer[i]!=NULL){
|
for(j=0; j<MAX_CPU_NUMBER; j++){
|
||||||
blas_memory_free(blas_thread_buffer[i]);
|
if(blas_thread_buffer[i][j]!=NULL){
|
||||||
blas_thread_buffer[i]=NULL;
|
blas_memory_free(blas_thread_buffer[i][j]);
|
||||||
|
blas_thread_buffer[i][j]=NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -206,7 +222,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void exec_threads(blas_queue_t *queue){
|
static void exec_threads(blas_queue_t *queue, int buf_index){
|
||||||
|
|
||||||
void *buffer, *sa, *sb;
|
void *buffer, *sa, *sb;
|
||||||
int pos=0, release_flag=0;
|
int pos=0, release_flag=0;
|
||||||
|
@ -223,7 +239,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||||
|
|
||||||
pos = omp_get_thread_num();
|
pos = omp_get_thread_num();
|
||||||
buffer = blas_thread_buffer[pos];
|
buffer = blas_thread_buffer[buf_index][pos];
|
||||||
|
|
||||||
//fallback
|
//fallback
|
||||||
if(buffer==NULL) {
|
if(buffer==NULL) {
|
||||||
|
@ -291,7 +307,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||||
|
|
||||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
|
|
||||||
BLASLONG i;
|
BLASLONG i, buf_index;
|
||||||
|
|
||||||
if ((num <= 0) || (queue == NULL)) return 0;
|
if ((num <= 0) || (queue == NULL)) return 0;
|
||||||
|
|
||||||
|
@ -302,16 +318,39 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#pragma omp parallel for schedule(static)
|
while(true) {
|
||||||
|
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
_Bool inuse = false;
|
||||||
|
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
|
||||||
|
#else
|
||||||
|
if(blas_buffer_inuse[i] == false) {
|
||||||
|
blas_buffer_inuse[i] = true;
|
||||||
|
#endif
|
||||||
|
buf_index = i;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(i != MAX_PARALLEL_NUMBER)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
#pragma omp parallel for schedule(OMP_SCHED)
|
||||||
for (i = 0; i < num; i ++) {
|
for (i = 0; i < num; i ++) {
|
||||||
|
|
||||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||||
queue[i].position = i;
|
queue[i].position = i;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
exec_threads(&queue[i]);
|
exec_threads(&queue[i], buf_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if __STDC_VERSION__ >= 201112L
|
||||||
|
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||||
|
#else
|
||||||
|
blas_buffer_inuse[buf_index] = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -50,7 +50,7 @@
|
||||||
|
|
||||||
/* This is a thread implementation for Win32 lazy implementation */
|
/* This is a thread implementation for Win32 lazy implementation */
|
||||||
|
|
||||||
/* Thread server common infomation */
|
/* Thread server common information */
|
||||||
typedef struct{
|
typedef struct{
|
||||||
CRITICAL_SECTION lock;
|
CRITICAL_SECTION lock;
|
||||||
HANDLE filled;
|
HANDLE filled;
|
||||||
|
@ -61,7 +61,7 @@ typedef struct{
|
||||||
|
|
||||||
} blas_pool_t;
|
} blas_pool_t;
|
||||||
|
|
||||||
/* We need this global for cheking if initialization is finished. */
|
/* We need this global for checking if initialization is finished. */
|
||||||
int blas_server_avail = 0;
|
int blas_server_avail = 0;
|
||||||
|
|
||||||
/* Local Variables */
|
/* Local Variables */
|
||||||
|
@ -461,12 +461,21 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||||
SetEvent(pool.killed);
|
SetEvent(pool.killed);
|
||||||
|
|
||||||
for(i = 0; i < blas_num_threads - 1; i++){
|
for(i = 0; i < blas_num_threads - 1; i++){
|
||||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
// Could also just use WaitForMultipleObjects
|
||||||
|
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
|
||||||
|
|
||||||
#ifndef OS_WINDOWSSTORE
|
#ifndef OS_WINDOWSSTORE
|
||||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||||
|
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||||
TerminateThread(blas_threads[i],0);
|
TerminateThread(blas_threads[i],0);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
CloseHandle(blas_threads[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
CloseHandle(pool.filled);
|
||||||
|
CloseHandle(pool.killed);
|
||||||
|
|
||||||
blas_server_avail = 0;
|
blas_server_avail = 0;
|
||||||
}
|
}
|
||||||
|
@ -480,6 +489,11 @@ void goto_set_num_threads(int num_threads)
|
||||||
{
|
{
|
||||||
long i;
|
long i;
|
||||||
|
|
||||||
|
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||||
|
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||||
|
if (unlikely(blas_server_avail == 0)) blas_thread_init();
|
||||||
|
#endif
|
||||||
|
|
||||||
if (num_threads < 1) num_threads = blas_cpu_number;
|
if (num_threads < 1) num_threads = blas_cpu_number;
|
||||||
|
|
||||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue