Merge branch 'develop' into risc-v

This commit is contained in:
Xianyi Zhang 2020-02-27 13:53:49 +08:00
commit 4aa2d89217
1686 changed files with 142614 additions and 26047 deletions

143
.drone.yml Normal file
View File

@ -0,0 +1,143 @@
---
kind: pipeline
name: arm64_gcc_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_gcc_make
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:19.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm64_clang_make
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
---
kind: pipeline
name: arm32_clang_cmake
platform:
os: linux
arch: arm
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest -V
---
kind: pipeline
name: arm64_gcc_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest -V
---
kind: pipeline
name: arm64_clang_cmake
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: clang
CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON'
commands:
- echo "CMAKE_FLAGS:= $CMAKE_FLAGS"
- apt-get update -y
- apt-get install -y make $CC g++ perl cmake
- $CC --version
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest -V

View File

@ -0,0 +1,78 @@
# Only the "head" branch of the OpenBLAS package is tested
on:
push:
paths:
- '**/nightly-Homebrew-build.yml'
pull_request:
branches:
- develop
paths:
- '**/nightly-Homebrew-build.yml'
schedule:
- cron: 45 7 * * *
# This is 7:45 AM UTC daily, late at night in the USA
# Since push and pull_request will still always be building and testing the `develop` branch,
# it only makes sense to test if this file has been changed
name: Nightly-Homebrew-Build
jobs:
build-OpenBLAS-with-Homebrew:
runs-on: macos-latest
env:
HOMEBREW_DEVELOPER: "ON"
HOMEBREW_DISPLAY_INSTALL_TIMES: "ON"
HOMEBREW_NO_ANALYTICS: "ON"
HOMEBREW_NO_AUTO_UPDATE: "ON"
HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON"
HOMEBREW_NO_INSTALL_CLEANUP: "ON"
steps:
- name: Random delay for cron job
run: |
delay=$(( RANDOM % 600 ))
printf 'Delaying for %s seconds on event %s' ${delay} "${{ github.event_name }}"
sleep ${delay}
if: github.event_name == 'schedule'
- uses: actions/checkout@v2
# This isn't even needed, technically. Homebrew will get `develop` via git
- name: Update Homebrew
if: github.event_name != 'pull_request'
run: brew update || true
- name: Install prerequisites
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
- name: Install and bottle OpenBLAS
run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas
# the HEAD flags tell Homebrew to build the develop branch fetch via git
- name: Create bottle
run: |
brew bottle -v openblas
mkdir bottles
mv *.bottle.tar.gz bottles
- name: Upload bottle
uses: actions/upload-artifact@v1
with:
name: openblas--HEAD.catalina.bottle.tar.gz
path: bottles
- name: Show linkage
run: brew linkage -v openblas
- name: Test openblas
run: brew test --HEAD --verbose openblas
- name: Audit openblas formula
run: |
brew audit --strict openblas
brew cat openblas
- name: Post logs on failure
if: failure()
run: brew gist-logs --with-hostname -v openblas

3
.gitignore vendored
View File

@ -87,4 +87,5 @@ build.*
*.swp *.swp
benchmark/*.goto benchmark/*.goto
benchmark/smallscaling benchmark/smallscaling
CMakeCache.txt
CMakeFiles/*

View File

@ -4,11 +4,10 @@ dist: precise
sudo: true sudo: true
language: c language: c
jobs: matrix:
include: include:
- &test-ubuntu - &test-ubuntu
os: linux os: linux
stage: test
compiler: gcc compiler: gcc
addons: addons:
apt: apt:
@ -18,7 +17,7 @@ jobs:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
script: script:
- set -e - set -e
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE
@ -26,6 +25,15 @@ jobs:
- TARGET_BOX=LINUX64 - TARGET_BOX=LINUX64
- BTYPE="BINARY=64" - BTYPE="BINARY=64"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu - <<: *test-ubuntu
env: env:
- TARGET_BOX=LINUX64 - TARGET_BOX=LINUX64
@ -59,7 +67,6 @@ jobs:
- BTYPE="BINARY=32" - BTYPE="BINARY=32"
- os: linux - os: linux
stage: test
compiler: gcc compiler: gcc
addons: addons:
apt: apt:
@ -80,13 +87,12 @@ jobs:
# that don't require sudo. # that don't require sudo.
- &test-alpine - &test-alpine
os: linux os: linux
stage: test
dist: trusty dist: trusty
sudo: true sudo: true
language: minimal language: minimal
before_install: before_install:
- "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \ - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e alpine-chroot-install' | sha1sum -c || exit 1" && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1"
- alpine() { /alpine/enter-chroot -u "$USER" "$@"; } - alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
install: install:
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
@ -120,11 +126,10 @@ jobs:
- <<: *test-alpine - <<: *test-alpine
env: env:
- TARGET_BOX=LINUX64_MUSL - TARGET_BOX=LINUX64_MUSL
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2" - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
- &test-cmake - &test-cmake
os: linux os: linux
stage: test
compiler: clang compiler: clang
addons: addons:
apt: apt:
@ -153,20 +158,27 @@ jobs:
- &test-macos - &test-macos
os: osx os: osx
stage: test osx_image: xcode10.1
osx_image: xcode8
before_script: before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update - brew update
- brew install gcc # for gfortran - brew install gcc@8 # for gfortran
script: script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env: env:
- BTYPE="BINARY=64 INTERFACE64=1" - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
- <<: *test-macos - <<: *test-macos
osx_image: xcode10.0
env: env:
- BTYPE="BINARY=32" - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
- <<: *test-macos
osx_image: xcode10.1
env:
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
# whitelist # whitelist
branches: branches:

View File

@ -6,21 +6,35 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM) project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 0.dev) set(OpenBLAS_PATCH_VERSION 9.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions # Adhere to GNU filesystem layout conventions
include(GNUInstallDirs) include(GNUInstallDirs)
set(OpenBLAS_LIBNAME openblas) include(CMakePackageConfigHelpers)
####### #######
if(MSVC) if(MSVC)
option(BUILD_WITHOUT_LAPACK "Without LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif() endif()
option(BUILD_WITHOUT_CBLAS "Without CBLAS" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Build with DYNAMIC_ARCH" OFF) option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive LAPACK" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
# 64 bit integer interfaces in OpenBLAS.
set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" )
set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" )
####### #######
if(BUILD_WITHOUT_LAPACK) if(BUILD_WITHOUT_LAPACK)
set(NO_LAPACK 1) set(NO_LAPACK 1)
@ -33,12 +47,27 @@ endif()
####### #######
if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.") message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
set(BLASDIRS interface driver/level2 driver/level3 driver/others) set(BLASDIRS interface driver/level2 driver/level3 driver/others)
if (NOT DYNAMIC_ARCH) if (NOT DYNAMIC_ARCH)
@ -51,10 +80,10 @@ endif ()
set(SUBDIRS ${BLASDIRS}) set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK) if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
if(BUILD_RELAPACK) if(BUILD_RELAPACK)
list(APPEND SUBDIRS relapack/src) list(APPEND SUBDIRS relapack/src)
endif() endif()
list(APPEND SUBDIRS lapack)
endif () endif ()
# set which float types we want to build for # set which float types we want to build for
@ -123,7 +152,7 @@ endif ()
# Only generate .def for dll on MSVC and always produce pdb files for debug and release # Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC) if(MSVC)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif() endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@ -138,14 +167,9 @@ if (${DYNAMIC_ARCH})
endforeach() endforeach()
endif () endif ()
# Only build shared libs for MSVC
if (MSVC)
set(BUILD_SHARED_LIBS ON)
endif()
# add objects to the openblas lib # add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)
# Android needs to explicitly link against libm # Android needs to explicitly link against libm
if(ANDROID) if(ANDROID)
@ -154,7 +178,7 @@ endif()
# Handle MSVC exports # Handle MSVC exports
if(MSVC AND BUILD_SHARED_LIBS) if(MSVC AND BUILD_SHARED_LIBS)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else() else()
# Creates verbose .def file (51KB vs 18KB) # Creates verbose .def file (51KB vs 18KB)
@ -165,6 +189,7 @@ endif()
# Set output for libopenblas # Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
@ -186,7 +211,8 @@ if (USE_THREAD)
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
endif() endif()
if (MSVC OR NOT NOFORTRAN) #if (MSVC OR NOT NOFORTRAN)
if (NOT NO_CBLAS)
# Broken without fortran on unix # Broken without fortran on unix
add_subdirectory(utest) add_subdirectory(utest)
endif() endif()
@ -204,14 +230,92 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION}
) )
if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
endif()
endif()
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
set(ARCH_IN ${ARCH})
endif()
if (${CORE} STREQUAL "generic")
set(ARCH_IN "GENERIC")
endif ()
if (NOT DEFINED EXPRECISION)
set(EXPRECISION_IN 0)
else()
set(EXPRECISION_IN ${EXPRECISION})
endif()
if (NOT DEFINED NO_CBLAS)
set(NO_CBLAS_IN 0)
else()
set(NO_CBLAS_IN ${NO_CBLAS})
endif()
if (NOT DEFINED NO_LAPACK)
set(NO_LAPACK_IN 0)
else()
set(NO_LAPACK_IN ${NO_LAPACK})
endif()
if (NOT DEFINED NO_LAPACKE)
set(NO_LAPACKE_IN 0)
else()
set(NO_LAPACKE_IN ${NO_LAPACKE})
endif()
if (NOT DEFINED NEED2UNDERSCORES)
set(NEED2UNDERSCORES_IN 0)
else()
set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES})
endif()
if (NOT DEFINED ONLY_CBLAS)
set(ONLY_CBLAS_IN 0)
else()
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
endif()
if (NOT DEFINED BU)
set(BU _)
endif()
if (NOT ${SYMBOLPREFIX} STREQUAL "")
message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
endif()
# Install project # Install project
# Install libraries # Install libraries
install(TARGETS ${OpenBLAS_LIBNAME} install(TARGETS ${OpenBLAS_LIBNAME}
EXPORT "OpenBLAS${SUFFIX64}Targets"
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
# Install headers
set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
set(CMAKE_INSTALL_FULL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR})
message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}") message(STATUS "Generating openblas_config.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h) set(OPENBLAS_CONFIG_H ${CMAKE_BINARY_DIR}/openblas_config.h)
@ -231,7 +335,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NOFORTRAN) if(NOT NOFORTRAN)
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@ -244,10 +348,11 @@ endif()
if(NOT NO_CBLAS) if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif() endif()
if(NOT NO_LAPACKE) if(NOT NO_LAPACKE)
@ -259,11 +364,31 @@ if(NOT NO_LAPACKE)
ADD_CUSTOM_TARGET(genlapacke ADD_CUSTOM_TARGET(genlapacke
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
) )
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
endif() endif()
include(FindPkgConfig QUIET) include(FindPkgConfig QUIET)
if(PKG_CONFIG_FOUND) if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY) configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif() endif()
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
set(PN OpenBLAS)
set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
configure_package_config_file(cmake/${PN}Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
write_basic_package_version_file(${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
VERSION ${${PN}_VERSION}
COMPATIBILITY AnyNewerVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake
RENAME ${PN}${SUFFIX64}ConfigVersion.cmake
DESTINATION ${CMAKECONFIG_INSTALL_DIR})
install(EXPORT "${PN}${SUFFIX64}Targets"
NAMESPACE "${PN}${SUFFIX64}::"
DESTINATION ${CMAKECONFIG_INSTALL_DIR})

View File

@ -167,4 +167,16 @@ In chronological order:
* [2017-02-26] ztrmm kernel for IBM z13 * [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13
* [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13
* [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel
* Jiachen Wang <https://github.com/wjc404>
* [2019-07-29] optimize AVX2 DGEMM
* [2019-10-20] AVX512 DGEMM kernel (4x8)
* [2019-11-06] optimize AVX512 SGEMM
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
* [2020-01-07] optimize AVX2 SGEMM and STRMM

View File

@ -1,4 +1,422 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.3.8
9-Feb-2020
common:
` * LAPACK has been updated to 3.9.0 (plus patches up to
January 2nd, 2020)
* CMAKE support has been improved in several areas including
cross-compilation
* a thread race condition in the GEMM3M kernels was resolved
* the "generic" (plain C) gemm beta kernel used by many targets
has been sped up
* an optimized version of the LAPACK trtrs functions has been added
* an incompatibilty between the LAPACK tests and the OpenBLAS
implementation of XERBLA was resolved, removing the numerous
warnings about wrong error exits in the former
* support for NetBSD has been added
* support for compilation with g95 and non-GNU versions of ld
has been improved
* support for compilation with (upcoming) gcc 10 has been added
POWER:
* worked around miscompilation of several POWER8 and POWER9
kernels by older versions of gcc
* added support for big-endian POWER8 and for compilation on AIX
* corrected bugs in the big-endian support for PPC440 and PPC970
* DYNAMIC_ARCH support is now available in CMAKE builds as well
ARMV8:
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
* compilation for 32bit works again
* performance of the RPCC function has been improved
* improved performance on small systems
* DYNAMIC_ARCH support is now available in CMAKE builds as well
* cross-compilation from OSX to IOS was simplified
x86_64:
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
was significantly improved
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
* added support for QEMU virtual cpus
* a compilation problem with PGI and SUN compilers was fixed
* Intel "Goldmont plus" is now autodetected
* a potential crash on program exit on MS Windows has been fixed
x86:
* an unwanted case sensitivity in the implementation of LSAME
on older 32bit AMD cpus was fixed
zarch:
* Z15 is now supported as Z14
* DYNAMIC_ARCH is now available on ZARCH as well
====================================================================
Version 0.3.7
11-Aug 2019
common:
* having the gmake special variables TARGET_ARCH or TARGET_MACH
defined no longer causes build failures in ctest or utest
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
has the same effect as setting them to 1
* a new test program was added to allow checking the library for
thread safety
* a new option USE_LOCKING was added to ensure thread safety when
OpenBLAS itself is built without multithreading but will be
called from multiple threads.
* a build failure on Linux with glibc versions earlier than 2.5
was fixed
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
on glibc 2.6 was fixed
* NO_AFFINITY was added to the CMAKE options (and defaults to being
active on Linux, as in the gmake builds)
x86_64:
* the build-time logic for detection of AVX512 availability in
the processor and compiler was fixed
* gmake builds on OSX now set the internal name of the library to
libopenblas.0.dylib (consistent with CMAKE)
* the Haswell DGEMM kernel received a significant speedup through
improved prefetch and load instructions
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
increased by avoiding vpermpd instructions
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
to fix remaining errors in DGEMM, DSYMM and DTRMM
POWER:
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
* added optimized kernels for POWER9 SGEMM and STRMM
ARMV7:
* fixed the softfp implementations of xAMAX and IxAMAX
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
they were appropriate for only a subset of platforms
====================================================================
Version 0.3.6
29-Apr-2019
common:
* the build tools now check that a given cpu TARGET is actually valid
* the build-time check of system features (c_check) has been made
less dependent on particular perl features (this should mainly
benefit building on Windows)
* several problem with the ReLAPACK integration were fixed,
including INTERFACE64 support and building a shared library
* building with CMAKE on BSD systems was improved
* a non-absolute SUM function was added based on the
existing optimized code for ASUM
* CBLAS interfaces to the IxMIN and IxMAX functions were added
* a name clash between LAPACKE and BOOST headers was resolved
* CMAKE builds with OpenMP failed to include the appropriate getrf_parallel
kernels
* a crash on thread (key) deletion with the USE_TLS=1 memory management
option was fixed
* restored several earlier fixes, in particular for OpenMP performance,
building on BSD, and calling fork on CYGWIN, which had inadvertently
been dropped in the 0.3.3 rewrite of the memory management code.
x86_64:
* the AVX512 DGEMM kernel has been disabled again due to unsolved problems
* building with old versions of MSVC was fixed
* it is now possible to build a static library on Windows with CMAKE
* accessing environment variables on CYGWIN at run time was fixed
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected
* building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported
with CMAKE as well
* building for DYNAMIC_ARCH with GENERIC as the default target is now supported
* a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed
* assembly bugs involving undeclared modification of input operands were fixed
in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem,
Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause
test failures or segfaults when compiled with recent versions of gcc from 8 onward.
* a similar bug was fixed in the blas_quickdivide code used to split workloads
in most functions
* a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX
* fixed building on SkylakeX systems when either the compiler or the (emulated) operating
environment does not support AVX512
* improved GEMM performance on ZEN targets
x86:
* build failures caused by the recently added checks for AVX512 were fixed
* an inline assembly bug involving undeclared modification of an input argument was
fixed in the blas_quickdivide code used to split workloads in most functions
* a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX
MIPS32:
* a bug in the IMIN implementation made it return the result of IMAX
POWER:
* single precision BLAS1/2 functions have received optimized POWER8 kernels
* POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel
* building on PPC970 systems under OSX Leopard or Tiger is now supported
* out-of-bounds memory accesses in the gemm_beta microkernels were fixed
* building a shared library on AIX is now supported for POWER6
* DYNAMIC_ARCH support has been added for POWER6 and newer
ARMv7:
* corrected xDOT behaviour with zero INC_X or INC_Y
* a bug in the IMIN implementation made it return the result of IMAX
ARMv8:
* added support for HiSilicon TSV110 cpus
* the CMAKE build system now recognizes 32bit userspace on 64bit hardware
* cross-compilation with CMAKE now works again
* a bug in the IMIN implementation made it return the result of IMAX
* ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7
IBM Z:
* optimized microkernels for single precicion BLAS1/2 functions have been added
for both Z13 and Z14
====================================================================
Version 0.3.5
31-Dec-2018
common:
* loop unrolling in TRMV has been enabled again.
* A domain error in the thread workload distribution for SYRK
has been fixed.
* gmake builds will now automatically add -fPIC to the build
options if the platform requires it.
* a pthreads key leakage (and associate crash on dlclose) in
the USE_TLS codepath was fixed.
* building of the utest cases on systems that do not provide
an implementation of complex.h was fixed.
x86_64:
* the SkylakeX code was changed to compile on OSX.
* unwanted application of the -march=skylake-avx512 option
to the common code parts of a DYNAMIC_ARCH build was fixed.
* improved performance of SGEMM for small workloads on Skylake X.
* performance of SGEMM and DGEMM was improved on Haswell.
ARMV8:
* a configuration error that broke the CNRM2 kernel was corrected.
* compilation of the GEMM kernels with CMAKE was fixed.
* DYNAMIC_ARCH builds are now available with CMAKE as well.
* using CMAKE for cross-compilation to the new cpu TARGETs
introduced in 0.3.4 now works.
POWER:
* a problem in cpu autodetection for AIX has been corrected.
====================================================================
Version 0.3.4
02-Dec-2018
common:
* the new, experimental thread-local memory allocation had
inadvertently been left enabled for gmake builds in 0.3.3
despite the announcement. It is now disabled by default, and
single-threaded builds will keep using the old allocator even
if the USE_TLS option is turned on.
* OpenBLAS will now provide enough buffer space for at least 50
threads by default.
* The output of openblas_get_config() now contains the version
number.
* A serious thread safety bug in GEMV operation with small M and
large N size has been fixed.
* The code will now automatically call blas_thread_init after a
fork if needed before handling a call to openblas_set_num_threads
* Accesses to parallelized level3 functions from multiple callers
are now serialized to avoid thread races (unless using OpenMP).
This should provide better performance than the known-threadsafe
(but non-default) USE_SIMPLE_THREADED_LEVEL3 option.
* When building LAPACK with gfortran, -frecursive is now (again)
enabled by default to ensure correct behaviour.
* The OpenBLAS version cblas.h now supports both CBLAS_ORDER and
CBLAS_LAYOUT as the name of the matrix row/column order option.
* Externally set LDFLAGS are now passed through to the final compile/link
steps to facilitate setting platform-specific linker flags.
* A potential race condition during the build of LAPACK (that would
usually manifest itself as a failure to build TESTING/MATGEN) has been
fixed.
* xHEMV has been changed to stay single-threaded for small input sizes
where the overhead of multithreading exceeds any possible gains
* CSWAP and ZSWAP have been limited to a single thread except on ARMV8 or
ThunderX hardware with sizable input.
* Linker flags for the PGI compiler have been updated
* Behaviour of AXPY with zero increments is now handled in the C interface,
correcting the result on at least Intel Atom.
* The result matrix from calling SGELSS with an all-zero input matrix is
now zeroed completely.
x86_64:
* Autodetection of AMD Ryzen2 has been fixed (again).
* CMAKE builds now support labeling of an INTERFACE64=1 build of
the library with the _64 suffix.
* AVX512 version of DGEMM has been added and the AVX512 SGEMM kernel
has been sped up by rewriting with C intrinsics
* Fixed compilation on RHEL5/CENTOS5 (issue with typename __WAIT_STATUS)
POWER:
* added support for building on AIX (with gcc and GNU tools from AIX Toolbox).
* CPU type detection has been implemented for AIX.
* CPU type detection has been fixed for NETBSD.
MIPS64:
* AXPY on LOONGSON3A has been corrected to pass "zero increment" utest.
* DSDOT on LOONGSON3A has been fixed.
* the SGEMM microkernel has been hardened against potential data loss.
ARMV8:
* DYNAMic_ARCH support is now available for 64bit ARM
* cross-compiling for ARMV8 under iOS now works.
* cpu-specific code has been rearranged to make better use of both
hardware commonalities and model-specific compiler optimizations.
* XGENE1 has been removed as a TARGET, superseded by the improved generic
ARMV8 support.
ARMV7:
* Older assembly mnemonics have been converted to UAL form to allow
building with clang 7.0
* Cross compiling LAPACKE for Android has been fixed again (broken by
update to LAPACK 3.7.0 some while ago).
====================================================================
Version 0.3.3
31-Aug-2018
common:
* thread memory allocation has been switched back to the method
used before version 0.3.1 due to unexpected problems caused by
the new code under some circumstances. A new compile-time option
USE_TLS has been added to enable the new code, and it is hoped
that this can become the default again in the next version.
* LAPAck PR272 has been integrated, which fixes spurious errors
in DSYEVR and related functions caused by missing conversion
from ILAENV to ILAENV_2STAGE in several _2stage routines.
* the cmake-generated OpenBLASConfig.cmake now uses correct case
for the name of the library
* added support for Haiku OS
x86_64:
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
DSCAL, DGEMVN and DSYMVL
* added a workaround for a cygwin issue that prevented compilation
of AVX512 code
IBM Z:
* added autodetection of Z14
* fixed TRMM errors in the generic target
====================================================================
Version 0.3.2
30-Jul-2018
common:
* fixes for regressions caused by the rewrite of the thread
initialization code in 0.3.1
POWER:
* fixed cpu autodetection for the BSDs
MIPS64:
* fixed utest errors in AXPY, DSDOT, ROT and SWAP
x86_64:
* added autodetection of AMD Ryzen 2
* fixed build with older versions of MSVC
====================================================================
Version 0.3.1
01-Jul-2018
common:
* rewritten thread initialization code with significantly reduced overhead
* added CBLAS interfaces to the IxAMIN BLAS extension functions
* fixed the lapack-test target
* CMAKE builds now create an OpenBLASConfig.cmake file
* ZAXPY now uses a single thread for small input sizes
* the LAPACK code was updated from Reference-LAPACK/lapack#253
(fixing LAPACKE interfaces to Aasen's functions)
POWER:
* corrected CROT and ZROT behaviour with zero INC_X
ARMV7:
* corrected xDOT behaviour with zero INC_X or INC_Y
x86_64:
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
specify the list of x86_64 targets to include. Any target not on the list will be supported
by the Sandybridge or Nehalem kernels if available, or by Prescott.
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
* added autodetection of Intel Cannon Lake series as Skylake X
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
recent mingw from MSYS2
* fixed a link error in mixed clang/gfortran builds with OpenMP
* updated the OSX deployment target to 10.8
* switched on parallel make for builds on MS Windows by default
x86:
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y
====================================================================
Version 0.3.0
23-May-2108
common:
* fixed some more thread race and locking bugs
* added preliminary support for calling an OpenMP build of the library from multiple threads
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
* general code cleanup
* optimized DSDOT implementation
* improved thread distribution for GEMM
* corrected IMATCOPY/OMATCOPY implementation
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
* cmake build improvements
* pkgconfig file now contains build options
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
* corrections and improvements for systems with more than 64 cpus
* LAPACK code updated to 3.8.0 including later fixes
* added ReLAPACK, a recursive implementation of several LAPACK functions
* Rewrote ROTMG to handle cases that the netlib code failed to address
* Disabled (broken) multithreading code for xTRMV
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
* shared memory access failures on startup are now handled more gracefully
* restored utests from earlier releases (and made them pass on all affected systems)
SPARC:
* several fixes for cpu autodetection
POWER:
* corrected vector register overwriting in several Power8 kernels
* optimized additional BLAS functions
ARM:
* added support for CortexA53 and A72
* added autodetection for ThunderX2T99
* made most optimized kernels the default for generic ARMv8 targets
x86_64:
* parallelized DDOT kernel for Haswell
* changed alignment directives in assembly kernels to boost performance on OSX
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
* added support for building on OpenBSD and Dragonfly
* updated compiler options to work with Intel release 2018
* support fully optimized build with clang/flang on Microsoft Windows
* fixed building on AIX
IBM Z:
* added optimized BLAS 1/2 functions
MIPS:
* fixed cpu autodetection helper code
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
* added mips64 I6500 cpu
==================================================================== ====================================================================
Version 0.2.20 Version 0.2.20
24-Jul-2017 24-Jul-2017

View File

@ -21,9 +21,20 @@ ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack RELA = re_lapack
endif endif
ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
1
endef
export NOFORTRAN
export NO_LAPACK
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install .PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
@ -47,7 +58,7 @@ endif
endif endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))" @echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))" @echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
endif endif
ifneq ($(OSNAME), AIX) ifneq ($(OSNAME), AIX)
@ -85,8 +96,8 @@ endif
@echo @echo
shared : shared :
ifndef NO_SHARED ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so @$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so @ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) @ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@ -98,6 +109,7 @@ endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
@$(MAKE) -C exports dyn @$(MAKE) -C exports dyn
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib @ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@$(MAKE) -C exports dll @$(MAKE) -C exports dll
@ -108,19 +120,22 @@ endif
endif endif
tests : tests :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME) touch $(LIBNAME)
ifndef NO_FBLAS ifndef NO_FBLAS
$(MAKE) -C test all $(MAKE) -C test all
$(MAKE) -C utest all
endif endif
$(MAKE) -C utest all
ifndef NO_CBLAS ifndef NO_CBLAS
$(MAKE) -C ctest all $(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif endif
endif endif
libs : libs :
ifeq ($(CORE), UNKOWN) ifeq ($(CORE), UNKNOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.) $(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif endif
ifeq ($(NOFORTRAN), 1) ifeq ($(NOFORTRAN), 1)
@ -153,6 +168,9 @@ ifeq ($(DYNAMIC_ARCH), 1)
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\ do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done done
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last @echo DYNAMIC_ARCH=1 >> Makefile.conf_last
ifeq ($(DYNAMIC_OLDER), 1)
@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
endif
endif endif
ifdef USE_THREAD ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
@ -207,7 +225,7 @@ netlib :
else else
netlib : lapack_prebuild netlib : lapack_prebuild
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif endif
@ -228,22 +246,22 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
lapack_prebuild : lapack_prebuild :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
@ -253,6 +271,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
ifdef SMP ifdef SMP
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else ifeq ($(OSNAME), Haiku)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif endif
@ -271,21 +291,21 @@ endif
endif endif
large.tgz : large.tgz :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz; -wget http://www.netlib.org/lapack/timing/large.tgz;
fi fi
endif endif
timing.tgz : timing.tgz :
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
if [ ! -a $< ]; then if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz; -wget http://www.netlib.org/lapack/timing/timing.tgz;
fi fi
endif endif
lapack-timing : large.tgz timing.tgz lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING) (cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz ) (cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING $(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
@ -294,11 +314,12 @@ endif
lapack-test : lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out) (cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1) ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \ ( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion ) ./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r ) (cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
endif endif
lapack-runtest: lapack-runtest:
@ -308,9 +329,9 @@ lapack-runtest:
blas-test: blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing $(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out) (cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
dummy : dummy :

View File

@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15)) ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android) ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon -march=armv7-a CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a FCOMMON_OPT += -mfpu=neon -march=armv7-a
else else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
@ -9,11 +9,6 @@ endif
endif endif
ifeq ($(CORE), ARMV6) ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp -march=armv6 CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp -march=armv6 FCOMMON_OPT += -mfpu=vfp
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
endif endif

View File

@ -4,22 +4,45 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a
endif endif
ifeq ($(CORE), CORTEXA57) ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
endif endif
ifeq ($(CORE), VULCAN) ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
endif
ifeq ($(CORE), CORTEXA72)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
ifeq ($(CORE), CORTEXA73)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif endif
ifeq ($(CORE), THUNDERX) ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx CCOMMON_OPT += -march=armv8-a -mtune=thunderx
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx FCOMMON_OPT += -march=armv8-a -mtune=thunderx
endif
ifeq ($(CORE), FALKOR)
CCOMMON_OPT += -march=armv8-a -mtune=falkor
FCOMMON_OPT += -march=armv8-a -mtune=falkor
endif endif
ifeq ($(CORE), THUNDERX2T99) ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif
endif

View File

@ -48,8 +48,10 @@ ifndef NO_CBLAS
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@ -57,21 +59,22 @@ ifndef NO_LAPACKE
endif endif
#for install static library #for install static library
ifndef NO_STATIC ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif endif
#for install shared library #for install shared library
ifndef NO_SHARED ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif endif
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly)) ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@ -79,9 +82,10 @@ ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
endif endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" @-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
endif endif
ifeq ($(OSNAME), WINNT) ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" @-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@ -93,11 +97,40 @@ ifeq ($(OSNAME), CYGWIN_NT)
endif endif
endif endif
else
#install on AIX has different options syntax
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
endif
#Generating openblas.pc #Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@ -108,7 +141,7 @@ endif
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED ifneq ($(NO_SHARED),1)
#ifeq logical or #ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

View File

@ -9,7 +9,15 @@ else
USE_OPENMP = 1 USE_OPENMP = 1
endif endif
ifeq ($(CORE), POWER9)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
endif
endif
ifeq ($(CORE), POWER8) ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
@ -21,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
endif endif
endif endif
# workaround for C->FORTRAN ABI violation in LAPACKE
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fno-optimize-sibling-calls
endif
FLAMEPATH = $(HOME)/flame/lib FLAMEPATH = $(HOME)/flame/lib

View File

@ -17,6 +17,10 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99 EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif endif
ifeq ($(TARGET), 1004K)
TARGET_FLAGS = -mips32r2
endif
ifeq ($(TARGET), P5600) ifeq ($(TARGET), P5600)
TARGET_FLAGS = -mips32r5 TARGET_FLAGS = -mips32r5
endif endif
@ -38,7 +42,7 @@ all: getarch_2nd
./getarch_2nd 1 >> $(TARGET_CONF) ./getarch_2nd 1 >> $(TARGET_CONF)
config.h : c_check f_check getarch config.h : c_check f_check getarch
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
ifneq ($(ONLY_CBLAS), 1) ifneq ($(ONLY_CBLAS), 1)
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS) perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
else else
@ -55,13 +59,13 @@ endif
getarch : getarch.c cpuid.S dummy $(CPUIDEMU) getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
$(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU) $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
getarch_2nd : getarch_2nd.c config.h dummy getarch_2nd : getarch_2nd.c config.h dummy
ifndef TARGET_CORE ifndef TARGET_CORE
$(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c $(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c
else else
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c $(HOSTCC) -I. $(HOST_CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
endif endif
dummy: dummy:

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.3.0.dev VERSION = 0.3.9.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -17,6 +17,11 @@ VERSION = 0.3.0.dev
# If you want to support multiple architecture in one binary # If you want to support multiple architecture in one binary
# DYNAMIC_ARCH = 1 # DYNAMIC_ARCH = 1
# If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
# mode (including individual optimizied codes for PENRYN, DUNNINGTON, OPTERON,
# OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
# DYNAMIC_OLDER = 1
# C compiler including binary type(32bit / 64bit). Default is gcc. # C compiler including binary type(32bit / 64bit). Default is gcc.
# Don't use Intel Compiler or PGI, it won't generate right codes as I expect. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect.
# CC = gcc # CC = gcc
@ -43,6 +48,8 @@ VERSION = 0.3.0.dev
# HOSTCC = gcc # HOSTCC = gcc
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# Please note that AVX is not available on 32-bit.
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
# BINARY=64 # BINARY=64
# About threaded BLAS. It will be automatically detected if you don't # About threaded BLAS. It will be automatically detected if you don't
@ -51,33 +58,72 @@ VERSION = 0.3.0.dev
# For force setting for multi threaded, specify USE_THREAD = 1 # For force setting for multi threaded, specify USE_THREAD = 1
# USE_THREAD = 0 # USE_THREAD = 0
# If you want to build a single-threaded OpenBLAS, but expect to call this
# from several concurrent threads in some other program, comment this in for
# thread safety. (This is done automatically for USE_THREAD=1 , and should not
# be necessary when USE_OPENMP=1)
# USE_LOCKING = 1
# If you're going to use this library with OpenMP, please comment it in. # If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't modify the flag # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1 # USE_OPENMP = 1
# You can define maximum number of threads. Basically it should be # The OpenMP scheduler to use - by default this is "static" and you
# less than actual number of cores. If you don't specify one, it's # will normally not want to change this unless you know that your main
# automatically detected by the the script. # workload will involve tasks that have highly unbalanced running times
# for individual threads. Changing away from "static" may also adversely
# affect memory access locality in NUMA systems. Setting to "runtime" will
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic
# You can define the maximum number of threads. Basically it should be less
# than or equal to the number of CPU threads. If you don't specify one, it's
# automatically detected by the build system.
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
# detection includes logical CPUs, thus allowing the use of SMT.
# Users may opt at runtime to use less than NUM_THREADS threads.
#
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
# footprint penalty, even if users reduce the actual number of threads at runtime.
# NUM_THREADS = 24 # NUM_THREADS = 24
# if you don't need to install the static library, please comment it in. # If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multiple threads, please comment this in.
# This flag defines how many instances of OpenBLAS's calculation API can actually
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# When multithreading, OpenBLAS needs to use a memory buffer for communicating
# and collating results for individual subranges of the original matrix. Since
# the original GotoBLAS of the early 2000s, the default size of this buffer has
# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC.
# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment
# this line and adjust the (32<<n) factor if necessary. Usually an insufficient value
# manifests itself as a crash in the relevant scal kernel (sscal_k, dscal_k etc)
# BUFFERSIZE = 25
# If you don't need to install the static library, please comment this in.
# NO_STATIC = 1 # NO_STATIC = 1
# if you don't need generate the shared library, please comment it in. # If you don't need to generate the shared library, please comment this in.
# NO_SHARED = 1 # NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in. # If you don't need the CBLAS interface, please comment this in.
# NO_CBLAS = 1 # NO_CBLAS = 1
# If you only want CBLAS interface without installing Fortran compiler, # If you only want the CBLAS interface without installing a Fortran compiler,
# please comment it in. # please comment this in.
# ONLY_CBLAS = 1 # ONLY_CBLAS = 1
# If you don't need LAPACK, please comment it in. # If you don't need LAPACK, please comment this in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. # If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1 # NO_LAPACK = 1
# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. # If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
# NO_LAPACKE = 1 # NO_LAPACKE = 1
# Build LAPACK Deprecated functions since LAPACK 3.6.0 # Build LAPACK Deprecated functions since LAPACK 3.6.0
@ -86,12 +132,18 @@ BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK # Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1 # BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation. # If you want to use the legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1 # USE_SIMPLE_THREADED_LEVEL3 = 1
# If you want to use the new, still somewhat experimental code that uses
# thread-local storage instead of a central memory buffer in memory.c
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
# for this to work.
# USE_TLS = 1
# If you want to drive whole 64bit region by BLAS. Not all Fortran # If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you # compilers support this. It's safe to keep this commented out if you
# are not sure(equivalent to "-i8" option). # are not sure. (This is equivalent to the "-i8" ifort option).
# INTERFACE64 = 1 # INTERFACE64 = 1
# Unfortunately most of kernel won't give us high quality buffer. # Unfortunately most of kernel won't give us high quality buffer.
@ -99,10 +151,18 @@ BUILD_LAPACK_DEPRECATED = 1
# but it will consume time. If you don't like it, you can disable one. # but it will consume time. If you don't like it, you can disable one.
NO_WARMUP = 1 NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux. # Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
# This feature is only implemented on Linux, and is always disabled on other platforms.
# Enabling affinity handling may improve performance, especially on NUMA systems, but
# it may conflict with certain applications that also try to manage affinity.
# This conflict can result in threads of the application calling OpenBLAS ending up locked
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
# else modifies affinity settings.
# Note: enabling affinity has been known to cause problems with NumPy and R
NO_AFFINITY = 1 NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1 # BIGNUMA = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
@ -112,6 +172,10 @@ NO_AFFINITY = 1
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6) # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
# NO_AVX2 = 1 # NO_AVX2 = 1
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
# system will try to determine this automatically)
# NO_AVX512 = 1
# Don't use parallel make. # Don't use parallel make.
# NO_PARALLEL_MAKE = 1 # NO_PARALLEL_MAKE = 1
@ -126,6 +190,9 @@ NO_AFFINITY = 1
# FUNCTION_PROFILE = 1 # FUNCTION_PROFILE = 1
# Support for IEEE quad precision(it's *real* REAL*16)( under testing) # Support for IEEE quad precision(it's *real* REAL*16)( under testing)
# This option should not be used - it is a holdover from unfinished code present
# in the original GotoBLAS2 library that may be usable as a starting point but
# is not even expected to compile in its present form.
# QUAD_PRECISION = 1 # QUAD_PRECISION = 1
# Theads are still working for a while after finishing BLAS operation # Theads are still working for a while after finishing BLAS operation
@ -133,22 +200,25 @@ NO_AFFINITY = 1
# time out to improve performance. This number should be from 4 to 30 # time out to improve performance. This number should be from 4 to 30
# which corresponds to (1 << n) cycles. For example, if you set to 26, # which corresponds to (1 << n) cycles. For example, if you set to 26,
# thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz
# system). Also you can control this mumber by THREAD_TIMEOUT # system). Also you can control this number by THREAD_TIMEOUT
# CCOMMON_OPT += -DTHREAD_TIMEOUT=26 # CCOMMON_OPT += -DTHREAD_TIMEOUT=26
# Using special device driver for mapping physically contigous memory # Using special device driver for mapping physically contiguous memory
# to the user space. If bigphysarea is enabled, it will use it. # to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1 # DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only). # If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# CONSISTENT_FPCSR = 1 # CONSISTENT_FPCSR = 1
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # If any gemm argument m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading # with single thread. (Actually in recent versions this is a factor proportional to the
# in small matrix sizes. The default value is 4. # number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4 # GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very # If you need sanity check by comparing results to reference BLAS. It'll be very
# slow (Not implemented yet). # slow (Not implemented yet).
# SANITY_CHECK = 1 # SANITY_CHECK = 1
@ -160,8 +230,8 @@ NO_AFFINITY = 1
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
# COMMON_OPT = -O2 # COMMON_OPT = -O2
# gfortran option for LAPACK # gfortran option for LAPACK to improve thread-safety
# enable this flag only on 64bit Linux and if you need a thread safe lapack library # It is enabled by default in Makefile.system for gfortran
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
# FCOMMON_OPT = -frecursive # FCOMMON_OPT = -frecursive
@ -188,6 +258,21 @@ COMMON_PROF = -pg
# SYMBOLPREFIX= # SYMBOLPREFIX=
# SYMBOLSUFFIX= # SYMBOLSUFFIX=
# Run a C++ based thread safety tester after the build is done.
# This is mostly intended as a developer feature to spot regressions, but users and
# package maintainers can enable this if they have doubts about the thread safety of
# the library, given the configuration in this file.
# By default, the thread safety tester launches 52 concurrent calculations at the same
# time.
#
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
#
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
# an OpenMP implementation. If you are cross-compiling this test will probably not
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
# #
# End of user configuration # End of user configuration
# #

View File

@ -9,6 +9,26 @@ ifndef TOPDIR
TOPDIR = . TOPDIR = .
endif endif
# If ARCH is not set, we use the host system's architecture for getarch compile options.
ifndef ARCH
HOSTARCH := $(shell uname -m)
else
HOSTARCH = $(ARCH)
endif
# Catch conflicting usage of ARCH in some BSD environments
ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# Default C compiler # Default C compiler
@ -54,6 +74,7 @@ endif
ifdef TARGET ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET) GETARCH_FLAGS := -DFORCE_$(TARGET)
GETARCH_FLAGS += -DUSER_TARGET
endif endif
# Force fallbacks for 32bit # Force fallbacks for 32bit
@ -62,6 +83,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL) ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
ifeq ($(TARGET), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE) ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
@ -80,6 +104,9 @@ endif
ifeq ($(TARGET), ZEN) ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA GETARCH_FLAGS := -DFORCE_BARCELONA
endif endif
ifeq ($(TARGET), ARMV8)
GETARCH_FLAGS := -DFORCE_ARMV7
endif
endif endif
@ -95,6 +122,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL) ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
ifeq ($(TARGET_CORE), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE) ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM GETARCH_FLAGS := -DFORCE_NEHALEM
endif endif
@ -116,7 +146,12 @@ endif
endif endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(HOSTARCH), x86_64)
ifeq ($(findstring pgcc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
ifdef INTERFACE64 ifdef INTERFACE64
ifneq ($(INTERFACE64), 0) ifneq ($(INTERFACE64), 0)
@ -134,13 +169,18 @@ GETARCH_FLAGS += -DNO_AVX
endif endif
ifeq ($(BINARY), 32) ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
NO_AVX512 = 1
endif endif
ifeq ($(NO_AVX2), 1) ifeq ($(NO_AVX2), 1)
GETARCH_FLAGS += -DNO_AVX2 GETARCH_FLAGS += -DNO_AVX2
endif endif
ifeq ($(NO_AVX512), 1)
GETARCH_FLAGS += -DNO_AVX512
endif
ifeq ($(DEBUG), 1) ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g GETARCH_FLAGS += -g
endif endif
@ -174,7 +214,7 @@ ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1 export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h # Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
ifndef TARGET_CORE ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf include $(TOPDIR)/Makefile.conf
@ -184,6 +224,10 @@ endif
endif endif
ifndef NUM_PARALLEL
NUM_PARALLEL = 1
endif
ifndef NUM_THREADS ifndef NUM_THREADS
NUM_THREADS = $(NUM_CORES) NUM_THREADS = $(NUM_CORES)
endif endif
@ -207,6 +251,10 @@ SMP = 1
endif endif
endif endif
ifeq ($(SMP), 1)
USE_LOCKING =
endif
ifndef NEED_PIC ifndef NEED_PIC
NEED_PIC = 1 NEED_PIC = 1
endif endif
@ -223,9 +271,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
OBJCONV = $(CROSS_SUFFIX)objconv OBJCONV = $(CROSS_SUFFIX)objconv
# For detect fortran failed, only build BLAS. # When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1) ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1 NO_LAPACK = 1
override FEXTRALIB =
endif endif
# #
@ -234,7 +283,7 @@ endif
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET ifndef MACOSX_DEPLOYMENT_TARGET
export MACOSX_DEPLOYMENT_TARGET=10.6 export MACOSX_DEPLOYMENT_TARGET=10.8
endif endif
MD5SUM = md5 -r MD5SUM = md5 -r
endif endif
@ -275,12 +324,14 @@ CCOMMON_OPT += -DMS_ABI
endif endif
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1) ifeq ($(GCCVERSIONGT4), 1)
# GCC Majar version > 4 # GCC Major version > 4
# It is compatible with MSVC ABI. # It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI CCOMMON_OPT += -DMS_ABI
endif endif
@ -358,6 +409,12 @@ ifneq ($(MAX_STACK_ALLOC), 0)
CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC)
endif endif
ifdef USE_LOCKING
ifneq ($(USE_LOCKING), 0)
CCOMMON_OPT += -DUSE_LOCKING
endif
endif
# #
# Architecture dependent settings # Architecture dependent settings
# #
@ -433,7 +490,7 @@ CCOMMON_OPT += -fopenmp
endif endif
ifeq ($(C_COMPILER), INTEL) ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -openmp CCOMMON_OPT += -fopenmp
endif endif
ifeq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), PGI)
@ -458,13 +515,70 @@ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += PENRYN DUNNINGTON
endif
DYNAMIC_CORE += NEHALEM
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += OPTERON OPTERON_SSE3
endif
DYNAMIC_CORE += BARCELONA
ifeq ($(DYNAMIC_OLDER), 1)
DYNAMIC_CORE += BOBCAT ATOM NANO
endif
ifneq ($(NO_AVX), 1) ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif endif
ifneq ($(NO_AVX2), 1) ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL ZEN DYNAMIC_CORE += HASWELL ZEN
endif endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX
endif
endif
endif
ifdef DYNAMIC_LIST
override DYNAMIC_CORE = PRESCOTT $(DYNAMIC_LIST)
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_PRESCOTT
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
CCOMMON_OPT += $(XCCOMMON_OPT)
#CCOMMON_OPT += -DDYNAMIC_LIST='$(DYNAMIC_LIST)'
endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA53
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110
DYNAMIC_CORE += EMAG8180
endif
ifeq ($(ARCH), zarch)
DYNAMIC_CORE = Z13
DYNAMIC_CORE += Z14
endif
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
endif
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
DYNAMIC_CORE += POWER9
else
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
endif
endif endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@ -564,9 +678,14 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64 FCOMMON_OPT += -march=mips64
endif endif
ifeq ($(CORE), 1004K)
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
endif
ifeq ($(CORE), P5600) ifeq ($(CORE), P5600)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS) FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
endif endif
ifeq ($(CORE), I6400) ifeq ($(CORE), I6400)
@ -606,7 +725,7 @@ endif
ifeq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), PGI)
ifdef BINARY64 ifdef BINARY64
CCOMMON_OPT += -tp p7-64 CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
else else
CCOMMON_OPT += -tp p7 CCOMMON_OPT += -tp p7
endif endif
@ -666,12 +785,19 @@ else
FCOMMON_OPT += -m32 FCOMMON_OPT += -m32
endif endif
endif endif
ifneq ($(NO_LAPACKE), 1)
FCOMMON_OPT += -fno-second-underscore
endif
endif endif
endif endif
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1) ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran EXTRALIB += -lgfortran
@ -717,7 +843,7 @@ FCOMMON_OPT += -i8
endif endif
endif endif
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp FCOMMON_OPT += -fopenmp
endif endif
endif endif
@ -897,6 +1023,10 @@ ifeq ($(DYNAMIC_ARCH), 1)
CCOMMON_OPT += -DDYNAMIC_ARCH CCOMMON_OPT += -DDYNAMIC_ARCH
endif endif
ifeq ($(DYNAMIC_OLDER), 1)
CCOMMON_OPT += -DDYNAMIC_OLDER
endif
ifeq ($(NO_LAPACK), 1) ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK CCOMMON_OPT += -DNO_LAPACK
#Disable LAPACK C interface #Disable LAPACK C interface
@ -919,6 +1049,10 @@ ifeq ($(NO_AVX2), 1)
CCOMMON_OPT += -DNO_AVX2 CCOMMON_OPT += -DNO_AVX2
endif endif
ifeq ($(NO_AVX512), 1)
CCOMMON_OPT += -DNO_AVX512
endif
ifdef SMP ifdef SMP
CCOMMON_OPT += -DSMP_SERVER CCOMMON_OPT += -DSMP_SERVER
@ -965,10 +1099,18 @@ endif
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS) CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
ifdef USE_SIMPLE_THREADED_LEVEL3 ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif endif
ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
ifndef SYMBOLPREFIX ifndef SYMBOLPREFIX
SYMBOLPREFIX = SYMBOLPREFIX =
endif endif
@ -1016,8 +1158,12 @@ endif
endif endif
ifdef NO_AFFINITY ifdef NO_AFFINITY
ifeq ($(NO_AFFINITY), 0)
override undefine NO_AFFINITY
else
CCOMMON_OPT += -DNO_AFFINITY CCOMMON_OPT += -DNO_AFFINITY
endif endif
endif
ifdef FUNCTION_PROFILE ifdef FUNCTION_PROFILE
CCOMMON_OPT += -DFUNCTION_PROFILE CCOMMON_OPT += -DFUNCTION_PROFILE
@ -1079,8 +1225,6 @@ ifndef FCOMMON_OPT
FCOMMON_OPT = -O2 -frecursive FCOMMON_OPT = -O2 -frecursive
endif endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
@ -1088,6 +1232,12 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES = #MAKEOVERRIDES =
ifdef NEED_PIC
ifeq (,$(findstring PIC,$(FFLAGS)))
override FFLAGS += -fPIC
endif
endif
#For LAPACK Fortran codes. #For LAPACK Fortran codes.
#Disable -fopenmp for LAPACK Fortran codes on Windows. #Disable -fopenmp for LAPACK Fortran codes on Windows.
ifdef OS_WINDOWS ifdef OS_WINDOWS
@ -1146,7 +1296,11 @@ endif
LIBDLLNAME = $(LIBPREFIX).dll LIBDLLNAME = $(LIBPREFIX).dll
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
ifneq ($(OSNAME), AIX)
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
else
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
endif
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
@ -1179,6 +1333,7 @@ export OSNAME
export ARCH export ARCH
export CORE export CORE
export LIBCORE export LIBCORE
export __BYTE_ORDER__
export PGCPATH export PGCPATH
export CONFIG export CONFIG
export CC export CC
@ -1223,6 +1378,7 @@ export MSA_FLAGS
export KERNELDIR export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE
export NO_AVX512
export SGEMM_UNROLL_M export SGEMM_UNROLL_M
export SGEMM_UNROLL_N export SGEMM_UNROLL_N

View File

@ -8,6 +8,38 @@ endif
endif endif
endif endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
endif
ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -mavx2
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif
ifeq ($(OSNAME), Interix) ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64 ARFLAGS = -m x64
endif endif

View File

@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector
endif endif
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

View File

@ -6,11 +6,13 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction ## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>. Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages ## Binary Packages
@ -22,8 +24,10 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source ## Installation from Source
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git. using Git from https://github.com/xianyi/OpenBLAS.git.
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
Most can also be given directly on the make or cmake command line.
### Dependencies ### Dependencies
@ -63,9 +67,7 @@ A debug version can be built using `make DEBUG=1`.
### Compile with MASS support on Power CPU (optional) ### Compile with MASS support on Power CPU (optional)
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures.
consists of a set of mathematical functions for C, C++, and Fortran applications that are
are tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown: The library can be installed as shown:
@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`.
## Supported CPUs and Operating Systems ## Supported CPUs and Operating Systems
Please read `GotoBLAS_01Readme.txt`. Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
### Additional supported CPUs ### Additional supported CPUs
@ -109,11 +111,13 @@ Please read `GotoBLAS_01Readme.txt`.
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS64 #### MIPS64
@ -127,26 +131,51 @@ Please read `GotoBLAS_01Readme.txt`.
#### ARM64 #### ARM64
- **ARMv8**: Experimental - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
- **ARM Cortex-A57**: Experimental - **Cortex-A53**: same as ARMV8 (different cpu specifications)
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 ( different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)
- **Falkor**: same as A57 (different cpu specifications)
- **ThunderX**: Optimized some Level-1 functions
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
- **TSV110**: Optimized some Level-3 helper functions
#### PPC/PPC64 #### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
#### IBM zEnterprise System #### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) - **Z13**: Optimized Level-3 BLAS and Level-1,2
- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default.
DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
### Supported OS ### Supported OS
- **GNU/Linux** - **GNU/Linux**
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>. - **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts. - **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS. - **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS. - **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS. - **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **AIX**: Supported on PPC up to POWER8
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
## Usage ## Usage
@ -200,7 +229,8 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code. Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`), * Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`. the library with `BIGNUMA=1`.
* OpenBLAS does not set processor affinity by default. * OpenBLAS does not set processor affinity by default.

View File

@ -20,6 +20,7 @@ DUNNINGTON
NEHALEM NEHALEM
SANDYBRIDGE SANDYBRIDGE
HASWELL HASWELL
SKYLAKEX
ATOM ATOM
b)AMD CPU: b)AMD CPU:
@ -47,6 +48,7 @@ POWER5
POWER6 POWER6
POWER7 POWER7
POWER8 POWER8
POWER9
PPCG4 PPCG4
PPC970 PPC970
PPC970MP PPC970MP
@ -56,6 +58,7 @@ CELL
3.MIPS CPU: 3.MIPS CPU:
P5600 P5600
1004K
4.MIPS64 CPU: 4.MIPS64 CPU:
SICORTEX SICORTEX
@ -81,11 +84,16 @@ ARMV5
8.ARM 64-bit CPU: 8.ARM 64-bit CPU:
ARMV8 ARMV8
CORTEXA53
CORTEXA57 CORTEXA57
VULCAN CORTEXA72
CORTEXA73
FALKOR
THUNDERX THUNDERX
THUNDERX2T99 THUNDERX2T99
TSV110
9.System Z: 9.System Z:
ZARCH_GENERIC ZARCH_GENERIC
Z13 Z13
Z14

View File

@ -35,6 +35,14 @@ environment:
DYNAMIC_ARCH: ON DYNAMIC_ARCH: ON
WITH_FORTRAN: no WITH_FORTRAN: no
- COMPILER: cl - COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-6.3.0-32
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
install: install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
@ -52,10 +60,17 @@ install:
before_build: before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build } - ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build - cd build
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
build_script: build_script:
- cmake --build . - cmake --build .
@ -64,3 +79,4 @@ test_script:
- echo Running Test - echo Running Test
- cd utest - cd utest
- openblas_utest - openblas_utest

51
azure-pipelines.yml Normal file
View File

@ -0,0 +1,51 @@
trigger:
# start a new build for every push
batch: False
branches:
include:
- develop
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
# of gcc / glibc
- job: manylinux1_gcc
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
echo "FROM quay.io/pypa/manylinux1_x86_64
COPY . /tmp/openblas
RUN cd /tmp/openblas && \
COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \
BTYPE='BINARY=64' CC=gcc && \
make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \
make -C test $COMMON_FLAGS $BTYPE && \
make -C ctest $COMMON_FLAGS $BTYPE && \
make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile
docker build .
displayName: Run manylinux1 docker build
- job: Intel_SDE_skx
pool:
vmImage: 'ubuntu-16.04'
steps:
- script: |
# at the time of writing the available Azure Ubuntu vm image
# does not support AVX512VL, so use more recent LTS version
echo "FROM ubuntu:bionic
COPY . /tmp/openblas
RUN apt-get -y update && apt-get -y install \\
cmake \\
gfortran \\
make \\
wget
RUN mkdir /tmp/SDE && cd /tmp/SDE && \\
mkdir sde-external-8.35.0-2019-03-11-lin && \\
wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\
tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1
RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64
CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile
docker build -t intel_sde .
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'

View File

@ -129,7 +129,10 @@ int main(int argc, char *argv[]){
int step = 1; int step = 1;
struct timeval start, stop; struct timeval start, stop;
double time1,timeg; double time1 = 0.0, timeg = 0.0;
long nanos = 0;
time_t seconds = 0;
struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
argc--;argv++; argc--;argv++;
@ -163,35 +166,32 @@ int main(int argc, char *argv[]){
timeg=0; timeg=0;
fprintf(stderr, " %6d : ", (int)m); fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l=0; l<loops; l++) for (l=0; l<loops; l++)
{ {
clock_gettime(CLOCK_REALTIME, &time_start);
COPY (&m, x, &inc_x, y, &inc_y );
clock_gettime(CLOCK_REALTIME, &time_end);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ nanos = time_end.tv_nsec - time_start.tv_nsec;
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; seconds = time_end.tv_sec - time_start.tv_sec;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ time1 = seconds + nanos / 1.e9;
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; timeg += time1;
} }
gettimeofday( &start, (struct timezone *)0);
COPY (&m, x, &inc_x, y, &inc_y ); timeg /= loops;
gettimeofday( &stop, (struct timezone *)0); fprintf(stderr,
" %10.2f MBytes %12.9f sec\n",
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg / 1.e6, timeg);
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
} }

View File

@ -122,7 +122,7 @@ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0}; FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0}; FLOAT beta [] = {1.0, 0.0};
char trans='N'; char trans='N';
blasint m, i, j; blasint m, i, j;
blasint inc_x=1,inc_y=1; blasint inc_x=1,inc_y=1;
@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n); fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){ for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){ for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
} }
@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); gettimeofday( &start, (struct timezone *)0);
@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n); fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){ for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){ for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
} }
@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); gettimeofday( &start, (struct timezone *)0);

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE) argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128 nfrom <- 128
nto <- 2048 nto <- 2048
nstep <- 128 nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z]) loops <- as.numeric(argv[z])
} }
} }
} }
p <- Sys.getenv("OPENBLAS_LOOPS") p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,29 +28,21 @@ if (p != "") {
loops <- as.numeric(p) loops <- as.numeric(p)
} }
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n")) cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom n <- nfrom
while (n <= nto) { while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n) A <- matrix(rnorm(n * n), nrow = n)
ev <- 0 ev <- 0
z <- system.time(for (l in 1:loops) { z <- system.time(for (l in 1:loops) {
ev <- eigen(A) ev <- eigen(A)
}) })
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n) st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep n <- n + nstep
} }

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE) argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128 nfrom <- 128
nto <- 2048 nto <- 2048
nstep <- 128 nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z]) loops <- as.numeric(argv[z])
} }
} }
} }
p <- Sys.getenv("OPENBLAS_LOOPS") p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,26 +28,13 @@ if (p != "") {
loops <- as.numeric(p) loops <- as.numeric(p)
} }
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n")) cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom n <- nfrom
while (n <= nto) { while (n <= nto) {
A <- matrix(runif(n * n), A <- matrix(runif(n * n), nrow = n)
ncol = n, B <- matrix(runif(n * n), nrow = n)
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
C <- 1 C <- 1
z <- system.time(for (l in 1:loops) { z <- system.time(for (l in 1:loops) {
@ -54,11 +42,10 @@ while (n <= nto) {
l <- l + 1 l <- l + 1
}) })
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)
st <- sprintf("%.0fx%.0f :", n, n) st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep n <- n + nstep
} }

View File

@ -2,6 +2,8 @@
argv <- commandArgs(trailingOnly = TRUE) argv <- commandArgs(trailingOnly = TRUE)
if (!is.null(options("matprod")[[1]])) options(matprod = "blas")
nfrom <- 128 nfrom <- 128
nto <- 2048 nto <- 2048
nstep <- 128 nstep <- 128
@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z]) loops <- as.numeric(argv[z])
} }
} }
} }
p <- Sys.getenv("OPENBLAS_LOOPS") p <- Sys.getenv("OPENBLAS_LOOPS")
@ -27,31 +28,22 @@ if (p != "") {
loops <- as.numeric(p) loops <- as.numeric(p)
} }
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n")) cat(sprintf(" SIZE Flops Time\n"))
n <- nfrom n <- nfrom
while (n <= nto) { while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n) A <- matrix(rnorm(n * n), nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n) B <- matrix(rnorm(n * n), nrow = n)
z <- system.time(for (l in 1:loops) { z <- system.time(for (l in 1:loops) {
solve(A, B) solve(A, B)
}) })
mflops <- mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
st <- sprintf("%.0fx%.0f :", n, n) st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
n <- n + nstep n <- n + nstep
} }

102
c_check
View File

@ -1,7 +1,7 @@
#!/usr/bin/perl #!/usr/bin/perl
use File::Basename; #use File::Basename;
use File::Temp qw(tempfile); # use File::Temp qw(tempfile);
# Checking cross compile # Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@ -12,17 +12,18 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x"); $hostarch = "zarch" if ($hostarch eq "s390x");
$tmpf = new File::Temp( UNLINK => 1 ); #$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"}; $binary = $ENV{"BINARY"};
$makefile = shift(@ARGV); $makefile = shift(@ARGV);
$config = shift(@ARGV); $config = shift(@ARGV);
$compiler_name = join(" ", @ARGV); $compiler_name = shift(@ARGV);
$flags = join(" ", @ARGV);
# First, we need to know the target OS and compiler name # First, we need to know the target OS and compiler name
$data = `$compiler_name -E ctest.c`; $data = `$compiler_name $flags -E ctest.c`;
if ($?) { if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
@ -31,12 +32,25 @@ if ($?) {
$cross_suffix = ""; $cross_suffix = "";
if (dirname($compiler_name) ne ".") { eval "use File::Basename";
$cross_suffix .= dirname($compiler_name) . "/"; if ($@){
} warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1; $cross_suffix .= $1;
}
} }
$compiler = ""; $compiler = "";
@ -64,6 +78,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/); $os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/); $os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
$architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/);
@ -167,7 +182,7 @@ if ($defined == 0) {
# Do again # Do again
$data = `$compiler_name -E ctest.c`; $data = `$compiler_name $flags -E ctest.c`;
if ($?) { if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n"; printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
@ -176,20 +191,26 @@ if ($?) {
$have_msa = 0; $have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) { if (($architecture eq "mips") || ($architecture eq "mips64")) {
$code = '"addvi.b $w0, $w1, 1"'; eval "use File::Temp qw(tempfile)";
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; if ($@){
print $tmpf "#include <msa.h>\n\n"; warn "could not load PERL module File::Temp, so could not check MSA capatibility";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else { } else {
$have_msa = 1; $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
} }
unlink("$tmpf.o");
} }
$architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86 if ($data =~ /ARCH_X86/);
@ -207,14 +228,39 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
$binformat = bin32; $binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/); $binformat = bin64 if ($data =~ /BINARY_64/);
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
if ($compiler eq "PGI") {
$args = " -tp skylake -c -o $tmpf.o $tmpf";
}
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("$tmpf.o");
}
}
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/; $data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1; $need_fu = $1;
$cross = 0; $cross = 0;
$cross = 1 if ($os ne $hostos);
if ($architecture ne $hostarch) { if ($architecture ne $hostarch) {
$cross = 1; $cross = 1;
@ -222,6 +268,8 @@ if ($architecture ne $hostarch) {
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips")); $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
} }
$cross = 1 if ($os ne $hostos);
$openmp = "" if $ENV{USE_OPENMP} != 1; $openmp = "" if $ENV{USE_OPENMP} != 1;
$linker_L = ""; $linker_L = "";
@ -229,7 +277,7 @@ $linker_l = "";
$linker_a = ""; $linker_a = "";
{ {
$link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`; $link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
$link =~ s/\-Y\sP\,/\-Y/g; $link =~ s/\-Y\sP\,/\-Y/g;
@ -267,6 +315,7 @@ $linker_a = "";
&& ($flags !~ /kernel32/) && ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/) && ($flags !~ /advapi32/)
&& ($flags !~ /shell32/) && ($flags !~ /shell32/)
&& ($flags !~ /omp/)
) { ) {
$linker_l .= $flags . " " $linker_l .= $flags . " "
} }
@ -294,6 +343,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
$os =~ tr/[a-z]/[A-Z]/; $os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/;

21
cblas.h
View File

@ -51,6 +51,7 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
typedef CBLAS_ORDER CBLAS_LAYOUT;
float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy); double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
@ -72,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS
float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX);
double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
@ -82,6 +88,21 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

View File

@ -0,0 +1,79 @@
# OpenBLASConfig.cmake
# --------------------
#
# OpenBLAS cmake module.
# This module sets the following variables in your project::
#
# OpenBLAS_FOUND - true if OpenBLAS and all required components found on the system
# OpenBLAS_VERSION - OpenBLAS version in format Major.Minor.Release
# OpenBLAS_INCLUDE_DIRS - Directory where OpenBLAS header is located.
# OpenBLAS_INCLUDE_DIR - same as DIRS
# OpenBLAS_LIBRARIES - OpenBLAS library to link against.
# OpenBLAS_LIBRARY - same as LIBRARIES
#
#
# Available components::
#
## shared - search for only shared library
## static - search for only static library
# serial - search for unthreaded library
# pthread - search for native pthread threaded library
# openmp - search for OpenMP threaded library
#
#
# Exported targets::
#
# If OpenBLAS is found, this module defines the following :prop_tgt:`IMPORTED`
## target. Target is shared _or_ static, so, for both, use separate, not
## overlapping, installations. ::
#
# OpenBLAS::OpenBLAS - the main OpenBLAS library #with header & defs attached.
#
#
# Suggested usage::
#
# find_package(OpenBLAS)
# find_package(OpenBLAS 0.2.20 EXACT CONFIG REQUIRED COMPONENTS pthread)
#
#
# The following variables can be set to guide the search for this package::
#
# OpenBLAS_DIR - CMake variable, set to directory containing this Config file
# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package
# PATH - environment variable, set to bin directory of this package
# CMAKE_DISABLE_FIND_PACKAGE_OpenBLAS - CMake variable, disables
# find_package(OpenBLAS) when not REQUIRED, perhaps to force internal build
@PACKAGE_INIT@
set(PN OpenBLAS)
# need to check that the @USE_*@ evaluate to something cmake can perform boolean logic upon
if(@USE_OPENMP@)
set(${PN}_openmp_FOUND 1)
elseif(@USE_THREAD@)
set(${PN}_pthread_FOUND 1)
else()
set(${PN}_serial_FOUND 1)
endif()
check_required_components(${PN})
#-----------------------------------------------------------------------------
# Don't include targets if this file is being picked up by another
# project which has already built this as a subproject
#-----------------------------------------------------------------------------
if(NOT TARGET ${PN}::OpenBLAS)
include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake")
get_property(_loc TARGET ${PN}::OpenBLAS PROPERTY LOCATION)
set(${PN}_LIBRARY ${_loc})
get_property(_ill TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_LINK_LIBRARIES)
set(${PN}_LIBRARIES ${_ill})
get_property(_id TARGET ${PN}::OpenBLAS PROPERTY INCLUDE_DIRECTORIES)
set(${PN}_INCLUDE_DIR ${_id})
get_property(_iid TARGET ${PN}::OpenBLAS PROPERTY INTERFACE_INCLUDE_DIRECTORIES)
set(${PN}_INCLUDE_DIRS ${_iid})
endif()

View File

@ -44,22 +44,49 @@ endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180)
endif ()
if (POWER)
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
endif ()
if (X86) if (X86)
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
endif () endif ()
if (X86_64) if (X86_64)
set(DYNAMIC_CORE PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) set(DYNAMIC_CORE PRESCOTT CORE2)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} PENRYN DUNNINGTON)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} NEHALEM)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} OPTERON OPTERON_SSE3)
endif ()
set (DYNAMIC_CORE ${DYNAMIC_CORE} BARCELONA)
if (DYNAMIC_OLDER)
set (DYNAMIC_CORE ${DYNAMIC_CORE} BOBCAT ATOM NANO)
endif ()
if (NOT NO_AVX) if (NOT NO_AVX)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR) set(DYNAMIC_CORE ${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR)
endif () endif ()
if (NOT NO_AVX2) if (NOT NO_AVX2)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif () endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif () endif ()
if (NOT DYNAMIC_CORE) if (NOT DYNAMIC_CORE)
unset(DYNAMIC_ARCH) message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
unset(DYNAMIC_ARCH CACHE)
endif () endif ()
endif () endif ()

View File

@ -3,7 +3,7 @@
## Description: Ported from portion of OpenBLAS/Makefile.system ## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets C related variables. ## Sets C related variables.
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang") if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall") set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
set(COMMON_PROF "${COMMON_PROF} -fno-inline") set(COMMON_PROF "${COMMON_PROF} -fno-inline")
@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
endif () endif ()
endif () endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PGI") if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
if (BINARY64) if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64") set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
else () else ()
@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
endif () endif ()
endif () endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE") if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
if (BINARY64) if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64") set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
else () else ()
@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
endif () endif ()
endif () endif ()
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64") if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
if (MIPS64) if (MIPS64)
@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
endif () endif ()
endif () endif ()
if (${CMAKE_C_COMPILER} STREQUAL "SUN") if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
set(CCOMMON_OPT "${CCOMMON_OPT} -w") set(CCOMMON_OPT "${CCOMMON_OPT} -w")
if (X86) if (X86)
set(CCOMMON_OPT "${CCOMMON_OPT} -m32") set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER} STREQUAL "SUN")
endif () endif ()
endif () endif ()
if (${CORE} STREQUAL "SKYLAKEX")
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
set (CCOMMON_OPT = "${CCOMMON_OPT} -march=skylake-avx512")
endif ()
endif ()
endif ()

View File

@ -3,6 +3,11 @@
## Description: Ported from portion of OpenBLAS/Makefile.system ## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables. ## Sets Fortran related variables.
if (INTERFACE64)
set(SUFFIX64 64)
set(SUFFIX64_UNDERSCORE _64)
endif()
if (${F_COMPILER} STREQUAL "FLANG") if (${F_COMPILER} STREQUAL "FLANG")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64) if (BINARY64 AND INTERFACE64)
@ -39,7 +44,10 @@ endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN") if (${F_COMPILER} STREQUAL "GFORTRAN")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall") # ensure reentrancy of lapack codes
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
# work around ABI violation in passing string arguments from C
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK) if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran") set(EXTRALIB "{EXTRALIB} -lgfortran")

View File

@ -1,7 +1,7 @@
# helper functions for the kernel CMakeLists.txt # helper functions for the kernel CMakeLists.txt
# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. # Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file.
macro(SetDefaultL1) macro(SetDefaultL1)
set(SAMAXKERNEL amax.S) set(SAMAXKERNEL amax.S)
set(DAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S)
@ -107,6 +107,12 @@ macro(SetDefaultL1)
set(DAXPBYKERNEL ../arm/axpby.c) set(DAXPBYKERNEL ../arm/axpby.c)
set(CAXPBYKERNEL ../arm/zaxpby.c) set(CAXPBYKERNEL ../arm/zaxpby.c)
set(ZAXPBYKERNEL ../arm/zaxpby.c) set(ZAXPBYKERNEL ../arm/zaxpby.c)
set(SSUMKERNEL sum.S)
set(DSUMKERNEL sum.S)
set(CSUMKERNEL zsum.S)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
endmacro () endmacro ()
macro(SetDefaultL2) macro(SetDefaultL2)

View File

@ -115,7 +115,9 @@ set(SLASRC
stplqt.f stplqt2.f stpmlqt.f stplqt.f stplqt2.f stpmlqt.f
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f) ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@ -210,7 +212,9 @@ set(CLASRC
ctplqt.f ctplqt2.f ctpmlqt.f ctplqt.f ctplqt2.f ctpmlqt.f
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f) chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
cungtsqr.f cunhr_col.f )
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@ -299,7 +303,9 @@ set(DLASRC
dtplqt.f dtplqt2.f dtpmlqt.f dtplqt.f dtplqt2.f dtpmlqt.f
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f) dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@ -398,7 +404,9 @@ set(ZLASRC
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f) zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
zungtsqr.f zunhr_col.f)
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f

View File

@ -715,6 +715,8 @@ set(DSRC
lapacke_dgesv_work.c lapacke_dgesv_work.c
lapacke_dgesvd.c lapacke_dgesvd.c
lapacke_dgesvd_work.c lapacke_dgesvd_work.c
lapacke_dgesvdq.c
lapacke_dgesvdq_work.c
lapacke_dgesvdx.c lapacke_dgesvdx.c
lapacke_dgesvdx_work.c lapacke_dgesvdx_work.c
lapacke_dgesvj.c lapacke_dgesvj.c
@ -1287,6 +1289,8 @@ set(SSRC
lapacke_sgesv_work.c lapacke_sgesv_work.c
lapacke_sgesvd.c lapacke_sgesvd.c
lapacke_sgesvd_work.c lapacke_sgesvd_work.c
lapacke_sgesvdq.c
lapacke_sgesvdq_work.c
lapacke_sgesvdx.c lapacke_sgesvdx.c
lapacke_sgesvdx_work.c lapacke_sgesvdx_work.c
lapacke_sgesvj.c lapacke_sgesvj.c
@ -1853,6 +1857,8 @@ set(ZSRC
lapacke_zgesv_work.c lapacke_zgesv_work.c
lapacke_zgesvd.c lapacke_zgesvd.c
lapacke_zgesvd_work.c lapacke_zgesvd_work.c
lapacke_zgesvdq.c
lapacke_zgesvdq_work.c
lapacke_zgesvdx.c lapacke_zgesvdx.c
lapacke_zgesvdx_work.c lapacke_zgesvdx_work.c
lapacke_zgesvj.c lapacke_zgesvj.c

View File

@ -1,9 +1,11 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libsuffix=@SUFFIX64_UNDERSCORE@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@ Version: @OPENBLAS_VERSION@
URL: https://github.com/xianyi/OpenBLAS URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -lopenblas Libs: -L${libdir} -lopenblas${libsuffix}
Cflags: -I${includedir} Cflags: -I${includedir}

View File

@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(NO_EXPRECISION 1) set(NO_EXPRECISION 1)
endif () endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
set(EXTRALIB "${EXTRALIB} -lm")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX")
set(EXTRALIB "${EXTRALIB} -lm") set(EXTRALIB "${EXTRALIB} -lm")
endif () endif ()

View File

@ -59,6 +59,9 @@ set(FU "")
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
set(FU "_") set(FU "_")
endif() endif()
if(MINGW AND NOT MINGW64)
set(FU "_")
endif()
set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU") if (${COMPILER_ID} STREQUAL "GNU")
@ -82,18 +85,59 @@ endif ()
# f_check # f_check
if (NOT NOFORTRAN) if (NOT NOFORTRAN)
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
else ()
file(APPEND ${TARGET_CONF_TEMP}
"#define BUNDERSCORE _\n"
"#define NEEDBUNDERSCORE 1\n")
set(BU "_")
endif () endif ()
# Cannot run getarch on target if we are cross-compiling # Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING) if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would # Write to config as getarch would
if (DEFINED TARGET_CORE)
set(TCORE ${TARGET_CORE})
else()
set(TCORE ${CORE})
endif()
# TODO: Set up defines that getarch sets up based on every other target # TODO: Set up defines that getarch sets up based on every other target
# Perhaps this should be inside a different file as it grows larger # Perhaps this should be inside a different file as it grows larger
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define ${CORE}\n" "#define ${TCORE}\n"
"#define CHAR_CORENAME \"${CORE}\"\n") "#define CORE_${TCORE}\n"
if ("${CORE}" STREQUAL "ARMV7") "#define CHAR_CORENAME \"${TCORE}\"\n")
if ("${TCORE}" STREQUAL "CORE2")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t1048576\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t256\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define SLOCAL_BUFFER_SIZE\t16384\n"
"#define DLOCAL_BUFFER_SIZE\t16384\n"
"#define CLOCAL_BUFFER_SIZE\t16384\n"
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n" "#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n" "#define L1_DATA_LINESIZE\t32\n"
@ -108,7 +152,11 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
set(SGEMM_UNROLL_N 4) set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4) set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4) set(DGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "ARMV8") set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
elseif ("${TCORE}" STREQUAL "ARMV8")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n" "#define L1_DATA_LINESIZE\t64\n"
@ -116,18 +164,26 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
"#define L2_LINESIZE\t64\n" "#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n" "#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t32\n") "#define L2_ASSOCIATIVE\t32\n"
set(SGEMM_UNROLL_M 4) "#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4) set(SGEMM_UNROLL_N 4)
elseif ("${CORE}" STREQUAL "CORTEXA57") set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n" "#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n" "#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n" "#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n" "#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n" "#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n" "#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t2097152\n" "#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n" "#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n" "#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n" "#define DTB_DEFAULT_ENTRIES\t64\n"
@ -135,15 +191,224 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
"#define HAVE_VFPV4\n" "#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n" "#define HAVE_VFPV3\n"
"#define HAVE_VFP\n" "#define HAVE_VFP\n"
"#define HAVE_NEON\n") "#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4) set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4) set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8) set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4) set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4) set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t49152\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "FALKOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t128\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t3\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t128\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t167772164\n"
"#define L2_LINESIZE\t128\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX2T99")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t8\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t8\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define L3_SIZE\t33554432\n"
"#define L3_LINESIZE\t64\n"
"#define L3_ASSOCIATIVE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "TSV110")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "EMAG8180")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t5262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER9")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
endif() endif()
# Or should this actually be NUM_CORES? # Or should this actually be NUM_CORES?
@ -163,6 +428,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
file(APPEND ${TARGET_CONF_TEMP} file(APPEND ${TARGET_CONF_TEMP}
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n") "#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
# Move to where gen_config_h would place it # Move to where gen_config_h would place it
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}") file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
else(NOT CMAKE_CROSSCOMPILING) else(NOT CMAKE_CROSSCOMPILING)
@ -178,6 +444,9 @@ else(NOT CMAKE_CROSSCOMPILING)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
else() else()
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
if (DEFINED TARGET_CORE)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
endif ()
endif () endif ()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")

View File

@ -33,19 +33,50 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.") message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1) set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE") if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
set(TARGET "NEHALEM") set(TARGET "NEHALEM")
endif () endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA") set(TARGET "BARCELONA")
endif () endif ()
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
set(TARGET "ARMV7")
endif ()
endif () endif ()
if (DEFINED TARGET)
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
endif()
endif()
if (DEFINED TARGET) if (DEFINED TARGET)
message(STATUS "Targeting the ${TARGET} architecture.") message(STATUS "Targeting the ${TARGET} architecture.")
set(GETARCH_FLAGS "-DFORCE_${TARGET}") set(GETARCH_FLAGS "-DFORCE_${TARGET}")
endif () endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
# On x86 no AVX support is available
if (X86 OR X86_64)
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
endif ()
endif ()
if (INTERFACE64) if (INTERFACE64)
message(STATUS "Using 64-bit integers.") message(STATUS "Using 64-bit integers.")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
@ -96,6 +127,10 @@ if (NOT CMAKE_CROSSCOMPILING)
endif() endif()
if (NOT DEFINED NUM_PARALLEL)
set(NUM_PARALLEL 1)
endif()
if (NOT DEFINED NUM_THREADS) if (NOT DEFINED NUM_THREADS)
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0) if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
# HT? # HT?
@ -113,10 +148,16 @@ endif ()
if (USE_THREAD) if (USE_THREAD)
message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.")
else()
if (${USE_LOCKING})
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING")
endif ()
endif () endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED BINARY)
message(STATUS "Compiling a ${BINARY}-bit binary.")
endif ()
if (NOT DEFINED NEED_PIC) if (NOT DEFINED NEED_PIC)
set(NEED_PIC 1) set(NEED_PIC 1)
endif () endif ()
@ -133,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN) if (NOT NOFORTRAN)
# Fortran Compiler dependent settings # Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif () endif ()
if (BINARY64) if (BINARY64)
@ -158,7 +202,22 @@ if (NEED_PIC)
endif () endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (X86 OR X86_64 OR ARM64 OR PPC)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
endif ()
else ()
unset (DYNAMIC_ARCH)
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
endif ()
endif ()
if (DYNAMIC_LIST)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
foreach(DCORE ${DYNAMIC_LIST})
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
endforeach ()
endif () endif ()
if (NO_LAPACK) if (NO_LAPACK)
@ -207,6 +266,10 @@ if (CONSISTENT_FPCSR)
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR") set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
endif () endif ()
if (USE_TLS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
endif ()
# Only for development # Only for development
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST") # set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
@ -224,6 +287,12 @@ endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}") set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
if (BUFFERSIZE)
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}")
endif ()
if (USE_SIMPLE_THREADED_LEVEL3) if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif () endif ()
@ -244,7 +313,7 @@ endif ()
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles # TODO: need to convert these Makefiles
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440") if (${CORE} STREQUAL "PPC440")
@ -291,6 +360,8 @@ if (MIXED_MEMORY_ALLOCATION)
set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION") set(CCOMMON_OPT "${CCOMMON_OPT} -DMIXED_MEMORY_ALLOCATION")
endif () endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DVERSION=\"\\\"${OpenBLAS_VERSION}\\\"\"")
set(REVISION "-r${OpenBLAS_VERSION}") set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})

View File

@ -10,6 +10,16 @@ if (${HOST_OS} STREQUAL "WINDOWS")
set(HOST_OS WINNT) set(HOST_OS WINNT)
endif () endif ()
if (${HOST_OS} STREQUAL "LINUX")
# check if we're building natively on Android (TERMUX)
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
if(${OPERATING_SYSTEM} MATCHES "Android")
set(HOST_OS ANDROID)
endif()
endif()
if(CMAKE_COMPILER_IS_GNUCC AND WIN32) if(CMAKE_COMPILER_IS_GNUCC AND WIN32)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE
@ -29,13 +39,45 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1) set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(X86_64 1) if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
else()
if (${BINARY} EQUAL "64")
set(X86_64 1)
else ()
set(X86 1)
endif()
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1) set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM 1) set(ARM 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
set(ARM64 1) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1)
else()
set(ARM 1)
endif()
elseif (${CMAKE_CROSSCOMPILING})
if (${TARGET} STREQUAL "CORE2")
if (NOT BINARY)
set(X86 1)
elseif (${BINARY} EQUAL "64")
set(X86_64 1)
else ()
set(X86 1)
endif()
elseif (${TARGET} STREQUAL "ARMV7")
set(ARM 1)
else()
set(ARM64 1)
endif ()
else ()
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
endif() endif()
if (X86_64) if (X86_64)
@ -66,3 +108,11 @@ else()
set(BINARY32 1) set(BINARY32 1)
endif() endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()
file(REMOVE "avx512.tmp" "avx512.o")
endif()

View File

@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in)
set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE)
endfunction () endfunction ()
# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition # generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition
# @param sources_in the source files to build from # @param sources_in the source files to build from
# @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param defines_in (optional) preprocessor definitions that will be applied to all objects
# @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended.

View File

@ -85,6 +85,8 @@ extern "C" {
#if !defined(_MSC_VER) #if !defined(_MSC_VER)
#include <unistd.h> #include <unistd.h>
#elif _MSC_VER < 1900
#define snprintf _snprintf
#endif #endif
#include <time.h> #include <time.h>
@ -105,6 +107,10 @@ extern "C" {
#endif #endif
#endif #endif
#ifdef OS_HAIKU
#define NO_SYSV_IPC
#endif
#ifdef OS_WINDOWS #ifdef OS_WINDOWS
#ifdef ATOM #ifdef ATOM
#define GOTO_ATOM ATOM #define GOTO_ATOM ATOM
@ -125,7 +131,7 @@ extern "C" {
#include <time.h> #include <time.h>
#include <unistd.h> #include <unistd.h>
#include <math.h> #include <math.h>
#ifdef SMP #if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h> #include <pthread.h>
#endif #endif
#endif #endif
@ -179,7 +185,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL #define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2) #define NUM_BUFFERS MAX(50,(MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER))
#ifdef NEEDBUNDERSCORE #ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_ #define BLASFUNC(FUNC) FUNC##_
@ -194,7 +200,7 @@ extern "C" {
#error "You can't specify both LOCK operation!" #error "You can't specify both LOCK operation!"
#endif #endif
#ifdef SMP #if defined(SMP) || defined(USE_LOCKING)
#define USE_PTHREAD_LOCK #define USE_PTHREAD_LOCK
#undef USE_PTHREAD_SPINLOCK #undef USE_PTHREAD_SPINLOCK
#endif #endif
@ -253,8 +259,14 @@ typedef unsigned long BLASULONG;
#ifdef USE64BITINT #ifdef USE64BITINT
typedef BLASLONG blasint; typedef BLASLONG blasint;
#if defined(OS_WINDOWS) && defined(__64BIT__)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else #else
typedef int blasint; typedef int blasint;
#define blasabs(x) abs(x)
#endif #endif
#else #else
#ifdef USE64BITINT #ifdef USE64BITINT
@ -338,6 +350,11 @@ typedef int blasint;
#endif #endif
#endif #endif
#ifdef POWER9
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
/* /*
#ifdef PILEDRIVER #ifdef PILEDRIVER
@ -434,7 +451,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
typedef char env_var_t[MAX_PATH]; typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0 #define readenv(p, n) 0
#else #else
#ifdef OS_WINDOWS #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
typedef char env_var_t[MAX_PATH]; typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else #else
@ -647,6 +664,7 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void); void gotoblas_profile_quit(void);
#ifdef USE_OPENMP #ifdef USE_OPENMP
#ifndef C_MSVC #ifndef C_MSVC
int omp_in_parallel(void); int omp_in_parallel(void);
int omp_get_num_procs(void); int omp_get_num_procs(void);
@ -654,6 +672,21 @@ int omp_get_num_procs(void);
__declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif #endif
#if (__STDC_VERSION__ >= 201112L)
#if defined(C_GCC) && ( __GNUC__ < 7)
// workaround for GCC bug 65467
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#include <stdatomic.h>
#else
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#else #else
#ifdef __ELF__ #ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak)); int omp_in_parallel (void) __attribute__ ((weak));

View File

@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
#define BLAS_LOCK_DEFINED #define BLAS_LOCK_DEFINED
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
static __inline BLASULONG rpcc(void){
BLASULONG ret = 0;
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
return ret;
}
#define RPCC_DEFINED
#define RPCC64BIT
#endif
static inline int blas_quickdivide(blasint x, blasint y){ static inline int blas_quickdivide(blasint x, blasint y){
return x / y; return x / y;
@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM) #if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \ .macro PROLOGUE
.text ;\ .text ;
.align 4 ;\ .p2align 2 ;
.global REALNAME ;\ .global REALNAME ;
.type REALNAME, %function ;\ #ifndef __APPLE__
.type REALNAME, %function ;
#endif
REALNAME: REALNAME:
.endm
#define EPILOGUE #define EPILOGUE

View File

@ -19,6 +19,7 @@
#define CDOTC_K cdotc_k #define CDOTC_K cdotc_k
#define CNRM2_K cnrm2_k #define CNRM2_K cnrm2_k
#define CSCAL_K cscal_k #define CSCAL_K cscal_k
#define CSUM_K csum_k
#define CSWAP_K cswap_k #define CSWAP_K cswap_k
#define CROT_K csrot_k #define CROT_K csrot_k
@ -249,6 +250,7 @@
#define CDOTC_K gotoblas -> cdotc_k #define CDOTC_K gotoblas -> cdotc_k
#define CNRM2_K gotoblas -> cnrm2_k #define CNRM2_K gotoblas -> cnrm2_k
#define CSCAL_K gotoblas -> cscal_k #define CSCAL_K gotoblas -> cscal_k
#define CSUM_K gotoblas -> csum_k
#define CSWAP_K gotoblas -> cswap_k #define CSWAP_K gotoblas -> cswap_k
#define CROT_K gotoblas -> csrot_k #define CROT_K gotoblas -> csrot_k

View File

@ -19,6 +19,7 @@
#define DDOTC_K ddot_k #define DDOTC_K ddot_k
#define DNRM2_K dnrm2_k #define DNRM2_K dnrm2_k
#define DSCAL_K dscal_k #define DSCAL_K dscal_k
#define DSUM_K dsum_k
#define DSWAP_K dswap_k #define DSWAP_K dswap_k
#define DROT_K drot_k #define DROT_K drot_k
@ -174,6 +175,7 @@
#define DDOTC_K gotoblas -> ddot_k #define DDOTC_K gotoblas -> ddot_k
#define DNRM2_K gotoblas -> dnrm2_k #define DNRM2_K gotoblas -> dnrm2_k
#define DSCAL_K gotoblas -> dscal_k #define DSCAL_K gotoblas -> dscal_k
#define DSUM_K gotoblas -> dsum_k
#define DSWAP_K gotoblas -> dswap_k #define DSWAP_K gotoblas -> dswap_k
#define DROT_K gotoblas -> drot_k #define DROT_K gotoblas -> drot_k

View File

@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzasum)(blasint *, double *, blasint *); double BLASFUNC(dzasum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *);
FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *);
double BLASFUNC(dsum) (blasint *, double *, blasint *);
xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
double BLASFUNC(dzsum)(blasint *, double *, blasint *);
xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
blasint BLASFUNC(isamax)(blasint *, float *, blasint *); blasint BLASFUNC(isamax)(blasint *, float *, blasint *);
blasint BLASFUNC(idamax)(blasint *, double *, blasint *); blasint BLASFUNC(idamax)(blasint *, double *, blasint *);
blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);

View File

@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
#endif #endif

View File

@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG);
double zasum_k (BLASLONG, double *, BLASLONG); double zasum_k (BLASLONG, double *, BLASLONG);
xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); xdouble xasum_k (BLASLONG, xdouble *, BLASLONG);
float ssum_k (BLASLONG, float *, BLASLONG);
double dsum_k (BLASLONG, double *, BLASLONG);
xdouble qsum_k (BLASLONG, xdouble *, BLASLONG);
float csum_k (BLASLONG, float *, BLASLONG);
double zsum_k (BLASLONG, double *, BLASLONG);
xdouble xsum_k (BLASLONG, xdouble *, BLASLONG);
float samax_k (BLASLONG, float *, BLASLONG); float samax_k (BLASLONG, float *, BLASLONG);
double damax_k (BLASLONG, double *, BLASLONG); double damax_k (BLASLONG, double *, BLASLONG);
xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); xdouble qamax_k (BLASLONG, xdouble *, BLASLONG);

View File

@ -47,6 +47,14 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *);
extern "C" { extern "C" {
#endif #endif
extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K,
float * A, BLASLONG strideA,
float * B, BLASLONG strideB,
float * R, BLASLONG strideR);
extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double,

View File

@ -66,6 +66,7 @@
#define DOTC_K QDOTC_K #define DOTC_K QDOTC_K
#define NRM2_K QNRM2_K #define NRM2_K QNRM2_K
#define SCAL_K QSCAL_K #define SCAL_K QSCAL_K
#define SUM_K QSUM_K
#define SWAP_K QSWAP_K #define SWAP_K QSWAP_K
#define ROT_K QROT_K #define ROT_K QROT_K
@ -356,6 +357,7 @@
#define DOTC_K DDOTC_K #define DOTC_K DDOTC_K
#define NRM2_K DNRM2_K #define NRM2_K DNRM2_K
#define SCAL_K DSCAL_K #define SCAL_K DSCAL_K
#define SUM_K DSUM_K
#define SWAP_K DSWAP_K #define SWAP_K DSWAP_K
#define ROT_K DROT_K #define ROT_K DROT_K
@ -658,6 +660,7 @@
#define DOTC_K SDOTC_K #define DOTC_K SDOTC_K
#define NRM2_K SNRM2_K #define NRM2_K SNRM2_K
#define SCAL_K SSCAL_K #define SCAL_K SSCAL_K
#define SUM_K SSUM_K
#define SWAP_K SSWAP_K #define SWAP_K SSWAP_K
#define ROT_K SROT_K #define ROT_K SROT_K
@ -962,6 +965,7 @@
#define DOTC_K XDOTC_K #define DOTC_K XDOTC_K
#define NRM2_K XNRM2_K #define NRM2_K XNRM2_K
#define SCAL_K XSCAL_K #define SCAL_K XSCAL_K
#define SUM_K XSUM_K
#define SWAP_K XSWAP_K #define SWAP_K XSWAP_K
#define ROT_K XROT_K #define ROT_K XROT_K
@ -1363,6 +1367,7 @@
#define DOTC_K ZDOTC_K #define DOTC_K ZDOTC_K
#define NRM2_K ZNRM2_K #define NRM2_K ZNRM2_K
#define SCAL_K ZSCAL_K #define SCAL_K ZSCAL_K
#define SUM_K ZSUM_K
#define SWAP_K ZSWAP_K #define SWAP_K ZSWAP_K
#define ROT_K ZROT_K #define ROT_K ZROT_K
@ -1785,6 +1790,7 @@
#define DOTC_K CDOTC_K #define DOTC_K CDOTC_K
#define NRM2_K CNRM2_K #define NRM2_K CNRM2_K
#define SCAL_K CSCAL_K #define SCAL_K CSCAL_K
#define SUM_K CSUM_K
#define SWAP_K CSWAP_K #define SWAP_K CSWAP_K
#define ROT_K CROT_K #define ROT_K CROT_K
@ -2800,3 +2806,160 @@ typedef struct {
#endif #endif
#endif #endif
#ifndef COMPLEX
#ifdef XDOUBLE
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
#elif defined(DOUBLE)
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
#else
#define TRTRS_UNU_SINGLE strtrs_UNU_single
#define TRTRS_UNN_SINGLE strtrs_UNN_single
#define TRTRS_UTU_SINGLE strtrs_UTU_single
#define TRTRS_UTN_SINGLE strtrs_UTN_single
#define TRTRS_LNU_SINGLE strtrs_LNU_single
#define TRTRS_LNN_SINGLE strtrs_LNN_single
#define TRTRS_LTU_SINGLE strtrs_LTU_single
#define TRTRS_LTN_SINGLE strtrs_LTN_single
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
#endif
#else
#ifdef XDOUBLE
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
#define TRTRS_URU_SINGLE xtrtrs_URU_single
#define TRTRS_URN_SINGLE xtrtrs_URN_single
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
#elif defined(DOUBLE)
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
#define TRTRS_URU_SINGLE ztrtrs_URU_single
#define TRTRS_URN_SINGLE ztrtrs_URN_single
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
#else
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
#define TRTRS_URU_SINGLE ctrtrs_URU_single
#define TRTRS_URN_SINGLE ctrtrs_URN_single
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
#endif
#endif

View File

@ -94,7 +94,7 @@ static inline unsigned int rpcc(void){
#define RPCC_DEFINED #define RPCC_DEFINED
#ifndef NO_AFFINITY #ifndef NO_AFFINITY
#define WHEREAMI //#define WHEREAMI
static inline int WhereAmI(void){ static inline int WhereAmI(void){
int ret=0; int ret=0;
__asm__ __volatile__(".set push \n" __asm__ __volatile__(".set push \n"

View File

@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG);
float (*ssum_k) (BLASLONG, float *, BLASLONG);
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG);
double (*dsum_k) (BLASLONG, double *, BLASLONG);
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble);
@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG);
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG);
double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG);
double (*zasum_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG);
double (*zsum_k) (BLASLONG, double *, BLASLONG);
int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG);
xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG);
int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);

View File

@ -39,7 +39,36 @@
#ifndef COMMON_POWER #ifndef COMMON_POWER
#define COMMON_POWER #define COMMON_POWER
#if defined(POWER8) #define str(x) #x
#ifdef OS_AIX
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
#define XVMOVDP(T,A) xvcpsgndp T, A, A
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
#else
#define XXSPLTD(T,A,z) xxspltd T, A, z
#define XXMRGHD(T,A,B) xxmrghd T, A, B
#define XXMRGLD(T,A,B) xxmrgld T, A, B
#define XXSWAPD(T,A) xxswapd T, A
#define XVMOVDP(T,A) xvmovdp T, A
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
#endif
#if defined(POWER8) || defined(POWER9)
#define MB __asm__ __volatile__ ("eieio":::"memory") #define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory")
#else #else
@ -241,7 +270,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH #define HAVE_PREFETCH
#endif #endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
#define DCBT_ARG 0 #define DCBT_ARG 0
#else #else
#define DCBT_ARG 8 #define DCBT_ARG 8
@ -263,7 +292,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst #define L1_PREFETCH dcbtst
#endif #endif
#if defined(POWER8) #if defined(POWER8) || defined(POWER9)
#define L1_DUALFETCH #define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst #define L1_PREFETCH dcbtst
@ -499,7 +528,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM) #if defined(ASSEMBLER) && !defined(NEEDPARAM)
#ifdef OS_LINUX #if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__ #ifndef __64BIT__
#define PROLOGUE \ #define PROLOGUE \
.section .text;\ .section .text;\
@ -598,9 +627,14 @@ REALNAME:;\
#ifndef __64BIT__ #ifndef __64BIT__
#define PROLOGUE \ #define PROLOGUE \
.machine "any";\ .machine "any";\
.toc;\
.globl .REALNAME;\ .globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.long .REALNAME, TOC[tc0], 0;\
.csect .text[PR],5;\ .csect .text[PR],5;\
.REALNAME:; .REALNAME:
#define EPILOGUE \ #define EPILOGUE \
_section_.text:;\ _section_.text:;\
@ -611,9 +645,14 @@ _section_.text:;\
#define PROLOGUE \ #define PROLOGUE \
.machine "any";\ .machine "any";\
.toc;\
.globl .REALNAME;\ .globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.llong .REALNAME, TOC[tc0], 0;\
.csect .text[PR], 5;\ .csect .text[PR], 5;\
.REALNAME:; .REALNAME:
#define EPILOGUE \ #define EPILOGUE \
_section_.text:;\ _section_.text:;\
@ -774,7 +813,7 @@ Lmcount$lazy_ptr:
#define HALT mfspr r0, 1023 #define HALT mfspr r0, 1023
#ifdef OS_LINUX #if defined(OS_LINUX) || defined(OS_FREEBSD)
#if defined(PPC440) || defined(PPC440FP2) #if defined(PPC440) || defined(PPC440FP2)
#undef MAX_CPU_NUMBER #undef MAX_CPU_NUMBER
#define MAX_CPU_NUMBER 1 #define MAX_CPU_NUMBER 1
@ -802,7 +841,7 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20) #define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2) #elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20) #define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8) #elif defined(POWER8) || defined(POWER9)
#define BUFFER_SIZE ( 64 << 20) #define BUFFER_SIZE ( 64 << 20)
#else #else
#define BUFFER_SIZE ( 16 << 20) #define BUFFER_SIZE ( 16 << 20)
@ -819,7 +858,7 @@ Lmcount$lazy_ptr:
#define MAP_ANONYMOUS MAP_ANON #define MAP_ANONYMOUS MAP_ANON
#endif #endif
#ifdef OS_LINUX #if defined(OS_LINUX) || defined(OS_FREEBSD)
#ifndef __64BIT__ #ifndef __64BIT__
#define FRAMESLOT(X) (((X) * 4) + 8) #define FRAMESLOT(X) (((X) * 4) + 8)
#else #else

View File

@ -19,6 +19,7 @@
#define QDOTC_K qdot_k #define QDOTC_K qdot_k
#define QNRM2_K qnrm2_k #define QNRM2_K qnrm2_k
#define QSCAL_K qscal_k #define QSCAL_K qscal_k
#define QSUM_K qsum_k
#define QSWAP_K qswap_k #define QSWAP_K qswap_k
#define QROT_K qrot_k #define QROT_K qrot_k
@ -161,6 +162,7 @@
#define QDOTC_K gotoblas -> qdot_k #define QDOTC_K gotoblas -> qdot_k
#define QNRM2_K gotoblas -> qnrm2_k #define QNRM2_K gotoblas -> qnrm2_k
#define QSCAL_K gotoblas -> qscal_k #define QSCAL_K gotoblas -> qscal_k
#define QSUM_K gotoblas -> qsum_k
#define QSWAP_K gotoblas -> qswap_k #define QSWAP_K gotoblas -> qswap_k
#define QROT_K gotoblas -> qrot_k #define QROT_K gotoblas -> qrot_k

View File

@ -12,6 +12,7 @@
#define ISMAX_K ismax_k #define ISMAX_K ismax_k
#define ISMIN_K ismin_k #define ISMIN_K ismin_k
#define SASUM_K sasum_k #define SASUM_K sasum_k
#define SSUM_K ssum_k
#define SAXPYU_K saxpy_k #define SAXPYU_K saxpy_k
#define SAXPYC_K saxpy_k #define SAXPYC_K saxpy_k
#define SCOPY_K scopy_k #define SCOPY_K scopy_k
@ -170,6 +171,7 @@
#define ISMAX_K gotoblas -> ismax_k #define ISMAX_K gotoblas -> ismax_k
#define ISMIN_K gotoblas -> ismin_k #define ISMIN_K gotoblas -> ismin_k
#define SASUM_K gotoblas -> sasum_k #define SASUM_K gotoblas -> sasum_k
#define SSUM_K gotoblas -> ssum_k
#define SAXPYU_K gotoblas -> saxpy_k #define SAXPYU_K gotoblas -> saxpy_k
#define SAXPYC_K gotoblas -> saxpy_k #define SAXPYC_K gotoblas -> saxpy_k
#define SCOPY_K gotoblas -> scopy_k #define SCOPY_K gotoblas -> scopy_k

View File

@ -45,16 +45,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* SIZE must be carefully chosen to be: * SIZE must be carefully chosen to be:
* - as small as possible to maximize the number of stack allocation * - as small as possible to maximize the number of stack allocation
* - large enough to support all architectures and kernel * - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing. * Choosing a SIZE too small will lead to a stack smashing.
*/ */
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \ /* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \ /* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \ volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \ if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
stack_alloc_size = 0; \ STACK_ALLOC_PROTECT_SET \
STACK_ALLOC_PROTECT_SET \ /* Avoid declaring an array of length 0 */ \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \ TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
__attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1); BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else #else
//Original OpenBLAS/GotoBLAS codes. //Original OpenBLAS/GotoBLAS codes.

View File

@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG); int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
int beta_thread(int mode, BLASLONG m, BLASLONG n,
double alpha_r, double alpha_i,
void *c, BLASLONG ldc, int (*fuction)());
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
void *offsetA, BLASLONG lda, void *offsetA, BLASLONG lda,
void *offsetB, BLASLONG jb, void *offsetB, BLASLONG jb,

View File

@ -19,6 +19,7 @@
#define XDOTC_K xdotc_k #define XDOTC_K xdotc_k
#define XNRM2_K xnrm2_k #define XNRM2_K xnrm2_k
#define XSCAL_K xscal_k #define XSCAL_K xscal_k
#define XSUM_K xsum_k
#define XSWAP_K xswap_k #define XSWAP_K xswap_k
#define XROT_K xqrot_k #define XROT_K xqrot_k
@ -227,6 +228,7 @@
#define XDOTC_K gotoblas -> xdotc_k #define XDOTC_K gotoblas -> xdotc_k
#define XNRM2_K gotoblas -> xnrm2_k #define XNRM2_K gotoblas -> xnrm2_k
#define XSCAL_K gotoblas -> xscal_k #define XSCAL_K gotoblas -> xscal_k
#define XSUM_K gotoblas -> xsum_k
#define XSWAP_K gotoblas -> xswap_k #define XSWAP_K gotoblas -> xswap_k
#define XROT_K gotoblas -> xqrot_k #define XROT_K gotoblas -> xqrot_k

View File

@ -178,10 +178,16 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
result = x/y; result = x/y;
return result; return result;
#else #else
#if (MAX_CPU_NUMBER > 64)
if ( y > 64) {
result = x/y;
return result;
}
#endif
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));
return result; return result;
#endif #endif
@ -208,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif #endif
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona. //Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION #define BARCELONA_OPTIMIZATION
#endif #endif

View File

@ -60,8 +60,13 @@
#endif #endif
*/ */
#define MB #ifdef __GNUC__
#define WMB #define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif
static void __inline blas_lock(volatile BLASULONG *address){ static void __inline blas_lock(volatile BLASULONG *address){
@ -124,7 +129,8 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
*ecx=cpuinfo[2]; *ecx=cpuinfo[2];
*edx=cpuinfo[3]; *edx=cpuinfo[3];
#else #else
__asm__ __volatile__("cpuid" __asm__ __volatile__("mov $0, %%ecx;"
"cpuid"
: "=a" (*eax), : "=a" (*eax),
"=b" (*ebx), "=b" (*ebx),
"=c" (*ecx), "=c" (*ecx),
@ -196,9 +202,16 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
if (y <= 1) return x; if (y <= 1) return x;
#if (MAX_CPU_NUMBER > 64)
if (y > 64) {
result = x / y;
return result;
}
#endif
y = blas_quick_divide_table[y]; y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));
return result; return result;
} }
@ -212,7 +225,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#endif #endif
#define HUGE_PAGESIZE ( 2 << 20) #define HUGE_PAGESIZE ( 2 << 20)
#ifndef BUFFERSIZE
#define BUFFER_SIZE (32 << 20) #define BUFFER_SIZE (32 << 20)
#else
#define BUFFER_SIZE (32 << BUFFERSIZE)
#endif
#define SEEK_ADDRESS #define SEEK_ADDRESS
@ -264,7 +281,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER #ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR)
//Enable some optimazation for barcelona. //Enable some optimization for barcelona.
#define BARCELONA_OPTIMIZATION #define BARCELONA_OPTIMIZATION
#endif #endif

View File

@ -19,6 +19,7 @@
#define ZDOTC_K zdotc_k #define ZDOTC_K zdotc_k
#define ZNRM2_K znrm2_k #define ZNRM2_K znrm2_k
#define ZSCAL_K zscal_k #define ZSCAL_K zscal_k
#define ZSUM_K zsum_k
#define ZSWAP_K zswap_k #define ZSWAP_K zswap_k
#define ZROT_K zdrot_k #define ZROT_K zdrot_k
@ -249,6 +250,7 @@
#define ZDOTC_K gotoblas -> zdotc_k #define ZDOTC_K gotoblas -> zdotc_k
#define ZNRM2_K gotoblas -> znrm2_k #define ZNRM2_K gotoblas -> znrm2_k
#define ZSCAL_K gotoblas -> zscal_k #define ZSCAL_K gotoblas -> zscal_k
#define ZSUM_K gotoblas -> zsum_k
#define ZSWAP_K gotoblas -> zswap_k #define ZSWAP_K gotoblas -> zswap_k
#define ZROT_K gotoblas -> zdrot_k #define ZROT_K gotoblas -> zdrot_k

14
cpp_thread_test/Makefile Normal file
View File

@ -0,0 +1,14 @@
include ../Makefile.rule
all :: dgemv_tester dgemm_tester
dgemv_tester :
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester
./dgemv_tester
dgemm_tester : dgemv_tester
$(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester
./dgemm_tester
clean ::
rm -f dgemv_tester dgemm_tester

View File

@ -0,0 +1,55 @@
inline void pauser(){
/// a portable way to pause a program
std::string dummy;
std::cout << "Press enter to continue...";
std::getline(std::cin, dummy);
}
void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for(uint32_t i=0; i<numMat; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
matBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){
for(uint32_t j=0; j<numMat; j++){
matBlock[i+j] = matBlock[j];
}
}
}
void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
for(uint32_t i=0; i<numVec; i++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
vecBlock[i][j] = rngdist(PRNG);
}
}
for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){
for(uint32_t j=0; j<numVec; j++){
vecBlock[i+j] = vecBlock[j];
}
}
}
std::mt19937_64 InitPRNG(){
std::random_device rd;
std::mt19937_64 PRNG(rd()); //seed PRNG using /dev/urandom or similar OS provided RNG
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
//make sure the internal state of the PRNG is properly mixed by generating 10M random numbers
//PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed
for (uint32_t i=0;i<10000000;i++) rngdist(PRNG);
return PRNG;
}
void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
for (uint32_t i=0;i<numConcurrentThreads*numMat;i++){
std::cout<<i<<std::endl;
for (uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
for (uint32_t k = 0; k < static_cast<uint32_t>(randomMatSize); k++){
std::cout<<matBlock[i][j*randomMatSize + k]<<" ";
}
std::cout<<std::endl;
}
std::cout<<std::endl;
}
}

View File

@ -0,0 +1,92 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs[0]);
numConcurrentThreads = std::stoul(cliArgs[1]);
numTestRounds = std::stoul(cliArgs[2]);
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMM thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
matBlock[i].resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMM thread safety\n";
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMM round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@ -0,0 +1,101 @@
#include <iostream>
#include <vector>
#include <random>
#include <future>
#include <omp.h>
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
int main(int argc, char* argv[]){
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
if (argc > 4){
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
randomMatSize = std::stoul(cliArgs.at(0));
numConcurrentThreads = std::stoul(cliArgs.at(1));
numTestRounds = std::stoul(cliArgs.at(2));
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
std::cout<<"*----------------------------*\n";
std::cout<<"| DGEMV thread safety tester |\n";
std::cout<<"*----------------------------*\n";
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
std::cout<<"Initializing random number generator..."<<std::flush;
std::mt19937_64 PRNG = InitPRNG();
std::cout<<"done\n";
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
matBlock.at(i).resize(randomMatSize*randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Allocating vectors..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
vecBlock.at(i).resize(randomMatSize);
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
std::cout<<"done\n";
std::cout<<"Filling vectors with random numbers..."<<std::flush;
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
std::cout<<"done\n";
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
std::cout<<"DGEMV round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
futureBlock[i].get();
}
std::cout<<"done\n";
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
return 0;
}

View File

@ -53,6 +53,7 @@
#define VENDOR_SIS 8 #define VENDOR_SIS 8
#define VENDOR_TRANSMETA 9 #define VENDOR_TRANSMETA 9
#define VENDOR_NSC 10 #define VENDOR_NSC 10
#define VENDOR_HYGON 11
#define VENDOR_UNKNOWN 99 #define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) #define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@ -115,6 +116,8 @@
#define CORE_STEAMROLLER 25 #define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26 #define CORE_EXCAVATOR 26
#define CORE_ZEN 27 #define CORE_ZEN 27
#define CORE_SKYLAKEX 28
#define CORE_DHYANA 29
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -137,6 +140,8 @@
#define HAVE_AVX (1 << 18) #define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19) #define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20) #define HAVE_FMA3 (1 << 20)
#define HAVE_AVX512VL (1 << 21)
#define HAVE_AVX2 (1 << 22)
#define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2 #define CACHE_INFO_L1_D 2
@ -211,5 +216,9 @@ typedef struct {
#define CPUTYPE_STEAMROLLER 49 #define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50 #define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51 #define CPUTYPE_ZEN 51
#define CPUTYPE_SKYLAKEX 52
#define CPUTYPE_DHYANA 53
#define CPUTYPE_HYGON_UNKNOWN 54
#endif #endif

View File

@ -34,7 +34,7 @@
#define CPU_CORTEXA15 4 #define CPU_CORTEXA15 4
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKNOWN",
"ARMV6", "ARMV6",
"ARMV7", "ARMV7",
"CORTEXA9", "CORTEXA9",

View File

@ -29,27 +29,47 @@
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_ARMV8 1 #define CPU_ARMV8 1
#define CPU_CORTEXA57 2 // Arm
#define CPU_VULCAN 3 #define CPU_CORTEXA53 2
#define CPU_THUNDERX 4 #define CPU_CORTEXA57 3
#define CPU_THUNDERX2T99 5 #define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
// Qualcomm
#define CPU_FALKOR 6
// Cavium
#define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8
//Hisilicon
#define CPU_TSV110 9
// Ampere
#define CPU_EMAG8180 10
static char *cpuname[] = { static char *cpuname[] = {
"UNKNOWN", "UNKNOWN",
"ARMV8" , "ARMV8" ,
"CORTEXA53",
"CORTEXA57", "CORTEXA57",
"VULCAN", "CORTEXA72",
"CORTEXA73",
"FALKOR",
"THUNDERX", "THUNDERX",
"THUNDERX2T99" "THUNDERX2T99",
"TSV110",
"EMAG8180"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
"unknown", "unknown",
"armv8" , "armv8",
"cortexa53",
"cortexa57", "cortexa57",
"vulcan", "cortexa72",
"cortexa73",
"falkor",
"thunderx", "thunderx",
"thunderx2t99" "thunderx2t99",
"tsv110",
"emag8180"
}; };
int get_feature(char *search) int get_feature(char *search)
@ -78,7 +98,7 @@ int get_feature(char *search)
if( p == NULL ) return 0; if( p == NULL ) return 0;
t = strtok(p," "); t = strtok(p," ");
while( t = strtok(NULL," ")) while( (t = strtok(NULL," ")))
{ {
if (!strcmp(t, search)) { return(1); } if (!strcmp(t, search)) { return(1); }
} }
@ -114,15 +134,31 @@ int detect(void)
fclose(infile); fclose(infile);
if(cpu_part != NULL && cpu_implementer != NULL) { if(cpu_part != NULL && cpu_implementer != NULL) {
if (strstr(cpu_implementer, "0x41") && // Arm
(strstr(cpu_part, "0xd07") || strstr(cpu_part,"0xd08") || strstr(cpu_part,"0xd03") )) if (strstr(cpu_implementer, "0x41")) {
return CPU_CORTEXA57; //or compatible A53, A72 if (strstr(cpu_part, "0xd03"))
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) return CPU_CORTEXA53;
return CPU_VULCAN; else if (strstr(cpu_part, "0xd07"))
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) return CPU_CORTEXA57;
else if (strstr(cpu_part, "0xd08"))
return CPU_CORTEXA72;
else if (strstr(cpu_part, "0xd09"))
return CPU_CORTEXA73;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
return CPU_FALKOR;
// Cavium
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0a1"))
return CPU_THUNDERX; return CPU_THUNDERX;
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99; return CPU_THUNDERX2T99;
// HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110;
// Ampere
else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000"))
return CPU_EMAG8180;
} }
p = (char *) NULL ; p = (char *) NULL ;
@ -177,67 +213,93 @@ void get_subdirname(void)
printf("arm64"); printf("arm64");
} }
void get_cpucount(void)
{
int n=0;
#ifdef linux
FILE *infile;
char buffer[2048], *p,*t;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("processor", buffer, 9))
n++;
}
fclose(infile);
printf("#define NUM_CORES %d\n",n);
#endif
}
void get_cpuconfig(void) void get_cpuconfig(void)
{ {
// All arches should define ARMv8
printf("#define ARMV8\n");
printf("#define HAVE_NEON\n"); // This shouldn't be necessary
printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary
int d = detect(); int d = detect();
switch (d) switch (d)
{ {
case CPU_CORTEXA53:
printf("#define %s\n", cpuname[d]);
// Fall-through
case CPU_ARMV8: case CPU_ARMV8:
printf("#define ARMV8\n"); // Minimum parameters for ARMv8 (based on A53)
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n"); printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n"); printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_VULCAN:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break; break;
case CPU_CORTEXA57: case CPU_CORTEXA57:
printf("#define CORTEXA57\n"); case CPU_CORTEXA72:
printf("#define HAVE_VFP\n"); case CPU_CORTEXA73:
printf("#define HAVE_VFPV3\n"); // Common minimum settings for these Arm cores
printf("#define HAVE_NEON\n"); // Can change a lot, but we need to be conservative
printf("#define HAVE_VFPV4\n"); // TODO: detect info from /sys if possible
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 49152\n"); printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n"); printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n"); printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n"); printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n"); printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 2097152\n"); printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n"); printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n"); printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
break; break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 524288\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX: case CPU_THUNDERX:
printf("#define ARMV8\n");
printf("#define THUNDERX\n"); printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n"); printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n"); printf("#define L1_DATA_LINESIZE 128\n");
@ -249,11 +311,7 @@ void get_cpuconfig(void)
break; break;
case CPU_THUNDERX2T99: case CPU_THUNDERX2T99:
printf("#define VULCAN \n"); printf("#define THUNDERX2T99 \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n"); printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n"); printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); printf("#define L1_CODE_ASSOCIATIVE 8 \n");
@ -269,7 +327,35 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n"); printf("#define DTB_SIZE 4096 \n");
break; break;
case CPU_TSV110:
printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_EMAG8180:
// Minimum parameters for ARMv8 (based on A53)
printf("#define EMAG8180\n");
printf("#define L1_CODE_SIZE 32768\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 262144\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
} }
get_cpucount();
} }
@ -305,12 +391,10 @@ void get_features(void)
if( p == NULL ) return; if( p == NULL ) return;
t = strtok(p," "); t = strtok(p," ");
while( t = strtok(NULL," ")) while( (t = strtok(NULL," ")))
{ {
} }
#endif #endif
return; return;
} }

View File

@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_P5600 1 #define CPU_P5600 1
#define CPU_1004K 2
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKNOWN",
"P5600" "P5600",
"1004K"
}; };
int detect(void){ int detect(void){
@ -90,7 +92,7 @@ int detect(void){
if (!strncmp("cpu", buffer, 3)){ if (!strncmp("cpu", buffer, 3)){
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
#if 0 #if 0
fprintf(stderr, "%s\n", p); fprintf(stderr, "%s \n", p);
#endif #endif
break; break;
} }
@ -99,43 +101,13 @@ int detect(void){
fclose(infile); fclose(infile);
if(p != NULL){ if(p != NULL){
if (strstr(p, "Loongson-3A")){ if (strstr(p, "5600")) {
return CPU_LOONGSON3A; return CPU_P5600;
}else if(strstr(p, "Loongson-3B")){ } else if (strstr(p, "1004K")) {
return CPU_LOONGSON3B; return CPU_1004K;
}else if (strstr(p, "Loongson-3")){ } else
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
}
#endif #endif
return CPU_UNKNOWN; return CPU_UNKNOWN;
} }
@ -149,7 +121,7 @@ void get_architecture(void){
} }
void get_subarchitecture(void){ void get_subarchitecture(void){
if(detect()==CPU_P5600){ if(detect()==CPU_P5600|| detect()==CPU_1004K){
printf("P5600"); printf("P5600");
}else{ }else{
printf("UNKNOWN"); printf("UNKNOWN");
@ -170,6 +142,14 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n"); printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n"); printf("#define L2_ASSOCIATIVE 8\n");
} else if (detect()==CPU_1004K) {
printf("#define MIPS1004K\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 26144\n");
printf("#define DTB_DEFAULT_ENTRIES 8\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{ }else{
printf("#define UNKNOWN\n"); printf("#define UNKNOWN\n");
} }
@ -178,6 +158,8 @@ void get_cpuconfig(void){
void get_libname(void){ void get_libname(void){
if(detect()==CPU_P5600) { if(detect()==CPU_P5600) {
printf("p5600\n"); printf("p5600\n");
} else if (detect()==CPU_1004K) {
printf("1004K\n");
}else{ }else{
printf("mips\n"); printf("mips\n");
} }

View File

@ -79,7 +79,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_I6500 6 #define CPU_I6500 6
static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN", "UNKNOWN",
"SICORTEX", "SICORTEX",
"LOONGSON3A", "LOONGSON3A",
"LOONGSON3B", "LOONGSON3B",

View File

@ -56,6 +56,7 @@
#define CPUTYPE_CELL 6 #define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7 #define CPUTYPE_PPCG4 7
#define CPUTYPE_POWER8 8 #define CPUTYPE_POWER8 8
#define CPUTYPE_POWER9 9
char *cpuname[] = { char *cpuname[] = {
"UNKNOWN", "UNKNOWN",
@ -66,7 +67,8 @@ char *cpuname[] = {
"POWER6", "POWER6",
"CELL", "CELL",
"PPCG4", "PPCG4",
"POWER8" "POWER8",
"POWER9"
}; };
char *lowercpuname[] = { char *lowercpuname[] = {
@ -78,7 +80,8 @@ char *lowercpuname[] = {
"power6", "power6",
"cell", "cell",
"ppcg4", "ppcg4",
"power8" "power8",
"power9"
}; };
char *corename[] = { char *corename[] = {
@ -90,7 +93,8 @@ char *corename[] = {
"POWER6", "POWER6",
"CELL", "CELL",
"PPCG4", "PPCG4",
"POWER8" "POWER8",
"POWER9"
}; };
int detect(void){ int detect(void){
@ -120,6 +124,7 @@ int detect(void){
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
@ -127,6 +132,33 @@ int detect(void){
#endif #endif
#ifdef _AIX #ifdef _AIX
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = popen("prtconf|grep 'Processor Type'", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Pro", buffer, 3)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
pclose(infile);
if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3;
if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4;
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;
return CPUTYPE_POWER5; return CPUTYPE_POWER5;
#endif #endif
@ -142,6 +174,52 @@ int detect(void){
return CPUTYPE_PPC970; return CPUTYPE_PPC970;
#endif #endif
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__)
int id;
__asm __volatile("mfpvr %0" : "=r"(id));
switch ( id >> 16 ) {
case 0x4e: // POWER9
return CPUTYPE_POWER9;
break;
case 0x4d:
case 0x4b: // POWER8/8E
return CPUTYPE_POWER8;
break;
case 0x4a:
case 0x3f: // POWER7/7E
return CPUTYPE_POWER6;
break;
case 0x3e:
return CPUTYPE_POWER6;
break;
case 0x3a:
return CPUTYPE_POWER5;
break;
case 0x35:
case 0x38: // POWER4 /4+
return CPUTYPE_POWER4;
break;
case 0x40:
case 0x41: // POWER3 /3+
return CPUTYPE_POWER3;
break;
case 0x39:
case 0x3c:
case 0x44:
case 0x45:
return CPUTYPE_PPC970;
break;
case 0x70:
return CPUTYPE_CELL;
break;
case 0x8003:
return CPUTYPE_PPCG4;
break;
default:
return CPUTYPE_UNKNOWN;
}
#endif
} }
void get_architecture(void){ void get_architecture(void){

View File

@ -50,6 +50,8 @@
#ifdef NO_AVX #ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM #define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM #define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
#define CORE_SKYLAKEX CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA #define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
@ -95,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
("mov %%ebx, %%edi;" ("mov %%ebx, %%edi;"
"cpuid;" "cpuid;"
"xchgl %%ebx, %%edi;" "xchgl %%ebx, %%edi;"
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc");
#else #else
__asm__ __volatile__ __asm__ __volatile__
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc");
#endif #endif
} }
@ -209,6 +211,44 @@ int support_avx(){
#endif #endif
} }
int support_avx2(){
#ifndef NO_AVX2
int eax, ebx, ecx=0, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 0)
ret=1; //OS supports AVX2
return ret;
#else
return 0;
#endif
}
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & 32) != 32){
ret=0; //OS does not even support AVX2
}
if((ebx & (1<<31)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0)
ret=1; //OS supports AVX512VL
}
return ret;
#else
return 0;
#endif
}
int get_vendor(void){ int get_vendor(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
@ -231,6 +271,7 @@ int get_vendor(void){
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC;
if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@ -292,6 +333,8 @@ int get_cputype(int gettype){
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
#ifndef NO_AVX #ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX; if (support_avx()) feature |= HAVE_AVX;
if (support_avx2()) feature |= HAVE_AVX2;
if (support_avx512()) feature |= HAVE_AVX512VL;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif #endif
@ -1004,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
} }
} }
if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { if ((get_vendor() == VENDOR_AMD) ||
(get_vendor() == VENDOR_HYGON) ||
(get_vendor() == VENDOR_CENTAUR)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx); cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
LDTB.size = 4096; LDTB.size = 4096;
@ -1152,7 +1197,11 @@ int get_cpuname(void){
case 3: case 3:
case 5: case 5:
case 6: case 6:
#if defined(__x86_64__) || defined(__amd64__)
return CPUTYPE_CORE2;
#else
return CPUTYPE_PENTIUM2; return CPUTYPE_PENTIUM2;
#endif
case 7: case 7:
case 8: case 8:
case 10: case 10:
@ -1166,7 +1215,7 @@ int get_cpuname(void){
return CPUTYPE_CORE2; return CPUTYPE_CORE2;
} }
break; break;
case 1: case 1: // family 6 exmodel 1
switch (model) { switch (model) {
case 6: case 6:
return CPUTYPE_CORE2; return CPUTYPE_CORE2;
@ -1183,7 +1232,7 @@ int get_cpuname(void){
return CPUTYPE_DUNNINGTON; return CPUTYPE_DUNNINGTON;
} }
break; break;
case 2: case 2: // family 6 exmodel 2
switch (model) { switch (model) {
case 5: case 5:
//Intel Core (Clarkdale) / Core (Arrandale) //Intel Core (Clarkdale) / Core (Arrandale)
@ -1212,7 +1261,7 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 3: case 3: // family 6 exmodel 3
switch (model) { switch (model) {
case 7: case 7:
// Bay Trail // Bay Trail
@ -1226,57 +1275,47 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12: case 12:
case 15: case 15:
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 13: case 13:
//Broadwell //Broadwell
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 4: case 4: // family 6 exmodel 4
switch (model) { switch (model) {
case 5: case 5:
case 6: case 6:
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 7: case 7:
case 15: case 15:
//Broadwell //Broadwell
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 14: case 14:
//Skylake //Skylake
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12: case 12:
@ -1286,54 +1325,85 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 5: case 5: // family 6 exmodel 5
switch (model) { switch (model) {
case 6: case 6:
//Broadwell //Broadwell
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 5: case 5:
// Skylake X
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14: case 14:
// Skylake // Skylake
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 7: case 7:
// Xeon Phi Knights Landing // Xeon Phi Knights Landing
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
case 12: case 12:
// Apollo Lake // Apollo Lake
case 15:
// Denverton
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
break; break;
case 6: // family 6 exmodel 6
switch (model) {
case 6: // Cannon Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 7: // family 6 exmodel 7
switch (model) {
case 10: // Goldmont Plus
return CPUTYPE_NEHALEM;
case 14: // Ice Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 9: case 9:
case 8: case 8:
switch (model) { switch (model) {
case 14: // Kaby Lake case 14: // Kaby Lake and refreshes
if(support_avx()) if(support_avx2())
#ifndef NO_AVX2
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
#else if(support_avx())
return CPUTYPE_SANDYBRIDGE; return CPUTYPE_SANDYBRIDGE;
#endif
else else
return CPUTYPE_NEHALEM; return CPUTYPE_NEHALEM;
} }
@ -1361,7 +1431,11 @@ int get_cpuname(void){
case 0x5: case 0x5:
return CPUTYPE_AMDK6; return CPUTYPE_AMDK6;
case 0x6: case 0x6:
#if defined(__x86_64__) || defined(__amd64__)
return CPUTYPE_BARCELONA;
#else
return CPUTYPE_ATHLON; return CPUTYPE_ATHLON;
#endif
case 0xf: case 0xf:
switch (exfamily) { switch (exfamily) {
case 0: case 0:
@ -1420,6 +1494,8 @@ int get_cpuname(void){
switch (model) { switch (model) {
case 1: case 1:
// AMD Ryzen // AMD Ryzen
case 8:
// AMD Ryzen2
if(support_avx()) if(support_avx())
#ifndef NO_AVX2 #ifndef NO_AVX2
return CPUTYPE_ZEN; return CPUTYPE_ZEN;
@ -1435,6 +1511,26 @@ int get_cpuname(void){
return CPUTYPE_AMD_UNKNOWN; return CPUTYPE_AMD_UNKNOWN;
} }
if (vendor == VENDOR_HYGON){
switch (family) {
case 0xf:
switch (exfamily) {
case 9:
//Hygon Dhyana
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;
#else
return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CPUTYPE_BARCELONA;
}
break;
}
return CPUTYPE_HYGON_UNKNOWN;
}
if (vendor == VENDOR_CYRIX){ if (vendor == VENDOR_CYRIX){
switch (family) { switch (family) {
case 0x4: case 0x4:
@ -1556,6 +1652,8 @@ static char *cpuname[] = {
"STEAMROLLER", "STEAMROLLER",
"EXCAVATOR", "EXCAVATOR",
"ZEN", "ZEN",
"SKYLAKEX",
"DHYANA"
}; };
static char *lowercpuname[] = { static char *lowercpuname[] = {
@ -1610,10 +1708,12 @@ static char *lowercpuname[] = {
"steamroller", "steamroller",
"excavator", "excavator",
"zen", "zen",
"skylakex",
"dhyana"
}; };
static char *corename[] = { static char *corename[] = {
"UNKOWN", "UNKNOWN",
"80486", "80486",
"P5", "P5",
"P6", "P6",
@ -1641,6 +1741,8 @@ static char *corename[] = {
"STEAMROLLER", "STEAMROLLER",
"EXCAVATOR", "EXCAVATOR",
"ZEN", "ZEN",
"SKYLAKEX",
"DHYANA"
}; };
static char *corename_lower[] = { static char *corename_lower[] = {
@ -1672,6 +1774,8 @@ static char *corename_lower[] = {
"steamroller", "steamroller",
"excavator", "excavator",
"zen", "zen",
"skylakex",
"dhyana"
}; };
@ -1714,7 +1818,11 @@ int get_coretype(void){
case 4: case 4:
case 5: case 5:
case 6: case 6:
#if defined(__x86_64__) || defined(__amd64__)
return CORE_CORE2;
#else
return CORE_P6; return CORE_P6;
#endif
case 7: case 7:
return CORE_KATMAI; return CORE_KATMAI;
case 8: case 8:
@ -1860,6 +1968,19 @@ int get_coretype(void){
else else
return CORE_NEHALEM; return CORE_NEHALEM;
case 5: case 5:
// Skylake X
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
case 14: case 14:
// Skylake // Skylake
if(support_avx()) if(support_avx())
@ -1885,6 +2006,38 @@ int get_coretype(void){
return CORE_NEHALEM; return CORE_NEHALEM;
} }
break; break;
case 6:
if (model == 6)
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
break;
case 7:
if (model == 10)
return CORE_NEHALEM;
if (model == 14)
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
break;
case 9: case 9:
case 8: case 8:
if (model == 14) { // Kaby Lake if (model == 14) { // Kaby Lake
@ -1908,7 +2061,11 @@ int get_coretype(void){
if (vendor == VENDOR_AMD){ if (vendor == VENDOR_AMD){
if (family <= 0x5) return CORE_80486; if (family <= 0x5) return CORE_80486;
#if defined(__x86_64__) || defined(__amd64__)
if (family <= 0xe) return CORE_BARCELONA;
#else
if (family <= 0xe) return CORE_ATHLON; if (family <= 0xe) return CORE_ATHLON;
#endif
if (family == 0xf){ if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT; else if (exfamily == 5) return CORE_BOBCAT;
@ -1958,6 +2115,8 @@ int get_coretype(void){
switch (model) { switch (model) {
case 1: case 1:
// AMD Ryzen // AMD Ryzen
case 8:
// Ryzen 2
if(support_avx()) if(support_avx())
#ifndef NO_AVX2 #ifndef NO_AVX2
return CORE_ZEN; return CORE_ZEN;
@ -1973,6 +2132,23 @@ int get_coretype(void){
} }
} }
if (vendor == VENDOR_HYGON){
if (family == 0xf){
if (exfamily == 9) {
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;
#else
return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CORE_BARCELONA;
} else {
return CORE_BARCELONA;
}
}
}
if (vendor == VENDOR_CENTAUR) { if (vendor == VENDOR_CENTAUR) {
switch (family) { switch (family) {
case 0x6: case 0x6:
@ -2059,6 +2235,8 @@ void get_cpuconfig(void){
if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n");
if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n");
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n");
if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
@ -2127,6 +2305,8 @@ void get_sse(void){
if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n");
if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n");
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n");
if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");

View File

@ -27,17 +27,23 @@
#include <string.h> #include <string.h>
#define CPU_GENERIC 0 #define CPU_GENERIC 0
#define CPU_Z13 1 #define CPU_Z13 1
#define CPU_Z14 2
#define CPU_Z15 3
static char *cpuname[] = { static char *cpuname[] = {
"ZARCH_GENERIC", "ZARCH_GENERIC",
"Z13" "Z13",
"Z14",
"Z15"
}; };
static char *cpuname_lower[] = { static char *cpuname_lower[] = {
"zarch_generic", "zarch_generic",
"z13" "z13",
"z14",
"z15"
}; };
int detect(void) int detect(void)
@ -61,6 +67,10 @@ int detect(void)
if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
return CPU_GENERIC; return CPU_GENERIC;
} }
@ -107,5 +117,16 @@ void get_cpuconfig(void)
printf("#define Z13\n"); printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n");
break; break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
} }
} }

View File

@ -101,6 +101,10 @@ OS_INTERIX
OS_LINUX OS_LINUX
#endif #endif
#if defined(__HAIKU__)
OS_HAIKU
#endif
#if defined(__i386) || defined(_X86) #if defined(__i386) || defined(_X86)
ARCH_X86 ARCH_X86
#endif #endif
@ -109,7 +113,7 @@ ARCH_X86
ARCH_X86_64 ARCH_X86_64
#endif #endif
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) #if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
ARCH_POWER ARCH_POWER
#endif #endif

View File

@ -6,6 +6,8 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS override CFLAGS += -DADD$(BU) -DCBLAS
override TARGET_ARCH=
override TARGET_MACH=
LIB = $(TOPDIR)/$(LIBNAME) LIB = $(TOPDIR)/$(LIBNAME)
@ -102,7 +104,13 @@ clean ::
rm -f x* rm -f x*
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB = ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif
# Single real # Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -1503,6 +1503,8 @@ C $ ' .' )
NC = 0 NC = 0
RESET = .TRUE. RESET = .TRUE.
ERRMAX = RZERO ERRMAX = RZERO
RALS = RONE
RBETS = RONE
* *
DO 100 IN = 1, NIDIM DO 100 IN = 1, NIDIM
N = IDIM( IN ) N = IDIM( IN )

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -653,7 +653,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -577,7 +577,7 @@
SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
* ************************* STEST1 ***************************** * ************************* STEST1 *****************************
* *
* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
* REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
* ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
* *

View File

@ -1504,6 +1504,8 @@ C $ ' .' )
NC = 0 NC = 0
RESET = .TRUE. RESET = .TRUE.
ERRMAX = RZERO ERRMAX = RZERO
RALS = RONE
RBETS = RONE
* *
DO 100 IN = 1, NIDIM DO 100 IN = 1, NIDIM
N = IDIM( IN ) N = IDIM( IN )

View File

@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS. T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO 16.0 THRESHOLD VALUE OF TEST RATIO
6 NUMBER OF VALUES OF N 7 NUMBER OF VALUES OF N
1 2 3 5 7 9 35 VALUES OF N 1 2 3 5 7 9 35 VALUES OF N
3 NUMBER OF VALUES OF ALPHA 3 NUMBER OF VALUES OF ALPHA
0.0 1.0 0.7 VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA

View File

@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
T LOGICAL FLAG, T TO TEST ERROR EXITS. T LOGICAL FLAG, T TO TEST ERROR EXITS.
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
16.0 THRESHOLD VALUE OF TEST RATIO 16.0 THRESHOLD VALUE OF TEST RATIO
6 NUMBER OF VALUES OF N 7 NUMBER OF VALUES OF N
0 1 2 3 5 9 35 VALUES OF N 0 1 2 3 5 9 35 VALUES OF N
3 NUMBER OF VALUES OF ALPHA 3 NUMBER OF VALUES OF ALPHA
0.0 1.0 0.7 VALUES OF ALPHA 0.0 1.0 0.7 VALUES OF ALPHA

View File

@ -62,9 +62,36 @@
#endif #endif
#endif #endif
#ifndef TRANSA #ifndef thread_local
# if __STDC_VERSION__ >= 201112 && !defined __STDC_NO_THREADS__
# define thread_local _Thread_local
# elif defined _WIN32 && ( \
defined _MSC_VER || \
defined __ICL || \
defined __DMC__ || \
defined __BORLANDC__ )
# define thread_local __declspec(thread)
/* note that ICC (linux) and Clang are covered by __GNUC__ */
# elif defined __GNUC__ || \
defined __SUNPRO_C || \
defined __xlC__
# define thread_local __thread
# else
# define UNSAFE
#endif
#endif
#if defined USE_OPENMP
#undef UNSAFE
#endif
#if !defined(TRANSA) && !defined(UNSAFE)
#define Y_DUMMY_NUM 1024 #define Y_DUMMY_NUM 1024
#if defined(USE_OPENMP)
static FLOAT y_dummy[Y_DUMMY_NUM]; static FLOAT y_dummy[Y_DUMMY_NUM];
#pragma omp threadprivate(y_dummy)
# else
static thread_local FLOAT y_dummy[Y_DUMMY_NUM];
# endif
#endif #endif
static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
@ -105,10 +132,12 @@ static int gemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#ifdef TRANSA #ifdef TRANSA
y += n_from * incy * COMPSIZE; y += n_from * incy * COMPSIZE;
#else #else
# ifndef UNSAFE
//for split matrix row (n) direction and vector x of gemv_n //for split matrix row (n) direction and vector x of gemv_n
x += n_from * incx * COMPSIZE; x += n_from * incx * COMPSIZE;
//store partial result for every thread //store partial result for every thread
y += (m_to - m_from) * 1 * COMPSIZE * pos; y += (m_to - m_from) * 1 * COMPSIZE * pos;
# endif
#endif #endif
} }
@ -136,7 +165,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
BLASLONG width, i, num_cpu; BLASLONG width, i, num_cpu;
#ifndef TRANSA #if !defined(TRANSA) && !defined(UNSAFE)
int split_x=0; int split_x=0;
#endif #endif
@ -212,7 +241,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
i -= width; i -= width;
} }
#ifndef TRANSA #if !defined(TRANSA) && !defined(UNSAFE)
//try to split matrix on row direction and x. //try to split matrix on row direction and x.
//Then, reduction. //Then, reduction.
if (num_cpu < nthreads) { if (num_cpu < nthreads) {
@ -272,7 +301,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
exec_blas(num_cpu, queue); exec_blas(num_cpu, queue);
} }
#ifndef TRANSA #if !defined(TRANSA) && !defined(UNSAFE)
if(split_x==1){ if(split_x==1){
//reduction //reduction
for(i=0; i<num_cpu; i++){ for(i=0; i<num_cpu; i++){

View File

@ -54,16 +54,12 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }
/*FIXME the GEMV unrolling performed here was found to be broken, see issue 1332 */ for (is = 0; is < m; is += DTB_ENTRIES){
/* Multiplying DTB size by 100 is just a quick-and-dirty hack to disable it for now[B */
for (is = 0; is < m; is += DTB_ENTRIES * 100){ min_i = MIN(m - is, DTB_ENTRIES);
min_i = MIN(m - is, DTB_ENTRIES * 100);
#ifndef TRANSA #ifndef TRANSA
if (is > 0){ if (is > 0){
fprintf(stderr,"WARNING unrolling of the trmv_U loop may give wrong results\n");
GEMV_N(is, min_i, 0, dp1, GEMV_N(is, min_i, 0, dp1,
a + is * lda, lda, a + is * lda, lda,
B + is, 1, B + is, 1,

View File

@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m; if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;
@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
range_m[num_cpu + 1] = range_m[num_cpu] + width; range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m; if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;
queue[num_cpu].mode = mode; queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel; queue[num_cpu].routine = trmv_kernel;

View File

@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)

View File

@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC(); START_RPCC();
@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC(); START_RPCC();
@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC(); START_RPCC();

View File

@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else #else
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else else
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC(); START_RPCC();

View File

@ -91,7 +91,12 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;
@ -99,7 +104,7 @@ typedef struct {
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \ #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \ GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
BETA[0], BETA[1], NULL, 0, NULL, 0, \ BETA[0], BETA[1], NULL, 0, NULL, 0, \
(FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC) (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
#endif #endif
#ifndef ICOPYB_OPERATION #ifndef ICOPYB_OPERATION
@ -403,13 +408,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */ /* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1); STOP_RPCC(waiting1);
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC(); START_RPCC();
@ -436,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
} WMB;
}
current = mypos; current = mypos;
@ -453,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC(); START_RPCC();
/* thread has to wait */ /* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2); STOP_RPCC(waiting2);
@ -472,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (m_to - m_from == min_i) { if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
} }
} }
} while (current != mypos); } while (current != mypos);
@ -512,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (is + min_i >= m_to) { if (is + min_i >= m_to) {
/* Thread doesn't need this buffer any more */ /* Thread doesn't need this buffer any more */
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
} }
} }
@ -536,13 +544,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */ /* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1); STOP_RPCC(waiting1);
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC(); START_RPCC();
@ -590,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC(); START_RPCC();
/* thread has to wait */ /* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2); STOP_RPCC(waiting2);
@ -608,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (m_to - m_from == min_i) { if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
} }
} }
} while (current != mypos); } while (current != mypos);
@ -672,13 +681,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using another buffer */ /* Make sure if no one is using another buffer */
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1); STOP_RPCC(waiting1);
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){ for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs; min_jj = MIN(n_to, xxx + div_n) - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC(); START_RPCC();
@ -726,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC(); START_RPCC();
/* thread has to wait */ /* thread has to wait */
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2); STOP_RPCC(waiting2);
@ -743,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
} }
if (m_to - m_from == min_i) { if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
} WMB;
}
} }
} while (current != mypos); } while (current != mypos);
@ -782,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif #endif
if (is + min_i >= m_to) { if (is + min_i >= m_to) {
/* Thread doesn't need this buffer any more */ /* Thread doesn't need this buffer any more */
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
} }
} }
@ -799,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for (i = 0; i < args -> nthreads; i++) { for (i = 0; i < args -> nthreads; i++) {
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
} }
} }
@ -835,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
blas_arg_t newarg; blas_arg_t newarg;
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];
@ -864,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE; mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
#endif #endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
newarg.m = args -> m; newarg.m = args -> m;
newarg.n = args -> n; newarg.n = args -> n;
newarg.k = args -> k; newarg.k = args -> k;
@ -968,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
free(job); free(job);
#endif #endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
return 0; return 0;
} }

View File

@ -67,7 +67,12 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; #if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;

View File

@ -48,6 +48,10 @@
#define SWITCH_RATIO 2 #define SWITCH_RATIO 2
#endif #endif
#ifndef GEMM_PREFERED_SIZE
#define GEMM_PREFERED_SIZE 1
#endif
//The array of job_t may overflow the stack. //The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t. //Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@ -91,7 +95,8 @@
#endif #endif
typedef struct { typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; volatile
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t; } job_t;
@ -346,7 +351,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Make sure if no one is using workspace */ /* Make sure if no one is using workspace */
START_RPCC(); START_RPCC();
for (i = 0; i < args -> nthreads; i++) for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1); STOP_RPCC(waiting1);
#if defined(FUSED_GEMM) && !defined(TIMING) #if defined(FUSED_GEMM) && !defined(TIMING)
@ -360,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Split local region of B into parts */ /* Split local region of B into parts */
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
min_jj = MIN(n_to, js + div_n) - jjs; min_jj = MIN(n_to, js + div_n) - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else else
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
/* Copy part of local region of B into workspace */ /* Copy part of local region of B into workspace */
START_RPCC(); START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
@ -408,7 +417,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Wait until other region of B is initialized */ /* Wait until other region of B is initialized */
START_RPCC(); START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2); STOP_RPCC(waiting2);
/* Apply kernel with local region of A and part of other region of B */ /* Apply kernel with local region of A and part of other region of B */
@ -426,6 +435,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Clear synchronization flag if this thread is done with other region of B */ /* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) { if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
WMB;
} }
} }
} while (current != mypos); } while (current != mypos);
@ -487,7 +497,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC(); START_RPCC();
for (i = 0; i < args -> nthreads; i++) { for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) { for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;}; while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
} }
} }
STOP_RPCC(waiting3); STOP_RPCC(waiting3);
@ -508,10 +518,29 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
return 0; return 0;
} }
static int round_up(int remainder, int width, int multiple)
{
if (multiple > remainder || width <= multiple)
return width;
width = (width + multiple - 1) / multiple;
width = width * multiple;
return width;
}
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
*range_n, FLOAT *sa, FLOAT *sb, *range_n, FLOAT *sa, FLOAT *sb,
BLASLONG nthreads_m, BLASLONG nthreads_n) { BLASLONG nthreads_m, BLASLONG nthreads_n) {
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
#else
CRITICAL_SECTION level3_lock;
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
blas_arg_t newarg; blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP #ifndef USE_ALLOC_HEAP
@ -552,6 +581,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
#endif #endif
#endif #endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_lock(&level3_lock);
#else
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
#ifdef USE_ALLOC_HEAP #ifdef USE_ALLOC_HEAP
/* Dynamically allocate workspace */ /* Dynamically allocate workspace */
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
@ -599,9 +636,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_parts = 0; num_parts = 0;
while (m > 0){ while (m > 0){
width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts); width = blas_quickdivide(m + nthreads_m - num_parts - 1, nthreads_m - num_parts);
width = round_up(m, width, GEMM_PREFERED_SIZE);
m -= width; m -= width;
if (m < 0) width = width + m; if (m < 0) width = width + m;
range_M[num_parts + 1] = range_M[num_parts] + width; range_M[num_parts + 1] = range_M[num_parts] + width;
num_parts ++; num_parts ++;
} }
for (i = num_parts; i < MAX_CPU_NUMBER; i++) { for (i = num_parts; i < MAX_CPU_NUMBER; i++) {
@ -643,9 +685,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
if (width < SWITCH_RATIO) { if (width < SWITCH_RATIO) {
width = SWITCH_RATIO; width = SWITCH_RATIO;
} }
width = round_up(n, width, GEMM_PREFERED_SIZE);
n -= width; n -= width;
if (n < 0) width = width + n; if (n < 0) width = width + n;
range_N[num_parts + 1] = range_N[num_parts] + width; range_N[num_parts + 1] = range_N[num_parts] + width;
num_parts ++; num_parts ++;
} }
for (j = num_parts; j < MAX_CPU_NUMBER; j++) { for (j = num_parts; j < MAX_CPU_NUMBER; j++) {
@ -653,8 +698,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
} }
/* Clear synchronization flags */ /* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) { for (i = 0; i < nthreads; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) { for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) { for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0; job[i].working[j][CACHE_LINE_SIZE * k] = 0;
} }
@ -669,6 +714,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
free(job); free(job);
#endif #endif
#ifndef USE_OPENMP
#ifndef OS_WINDOWS
pthread_mutex_unlock(&level3_lock);
#else
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
#endif
#endif
return 0; return 0;
} }

View File

@ -48,7 +48,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
BLASLONG width, i; BLASLONG width, i;
BLASLONG n_from, n_to; BLASLONG n_from, n_to;
double dnum, nf, nt, di; double dnum, nf, nt, di, dinum;
int num_cpu; int num_cpu;
int mask = 0; int mask = 0;
@ -109,7 +109,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) { if (nthreads - num_cpu > 1) {
di = (double)i; di = (double)i;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1); dinum = di * di +dnum;
if (dinum <0)
width = (BLASLONG)(( - di + mask)/(mask+1)) * (mask+1);
else
width = (BLASLONG)(( sqrt(dinum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i; if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@ -136,9 +140,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
nf = (double)(arg -> n - n_from); nf = (double)(arg -> n - n_from);
nt = (double)(arg -> n - n_to); nt = (double)(arg -> n - n_to);
dnum = (nt * nt - nf * nf) / (double)nthreads; dnum = (nt * nt - nf * nf) / (double)nthreads;
num_cpu = 0; num_cpu = 0;
range[0] = n_from; range[0] = n_from;
@ -149,8 +151,11 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) { if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i); di = (double)(arg -> n - i);
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1); dinum = di * di + dnum;
if (dinum<0)
width = ((BLASLONG)(di + mask)/(mask+1)) * (mask+1);
else
width = ((BLASLONG)((-sqrt(dinum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i; if ((width <= 0) || (width > n_to - i)) width = n_to - i;
} else { } else {

View File

@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC(); START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC(); START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE); GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC(); START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb, GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC(); START_RPCC();
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb, GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,

View File

@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < ls - js; jjs += min_jj){ for(jjs = 0; jjs < ls - js; jjs += min_jj){
min_jj = ls - js - jjs; min_jj = ls - js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA #ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
#else #else
@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){ for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs; min_jj = min_l - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA #ifndef TRANSA
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE); TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
#else #else
@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA #ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else #else
@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_l; jjs += min_jj){ for(jjs = 0; jjs < min_l; jjs += min_jj){
min_jj = min_l - jjs; min_jj = min_l - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA #ifndef TRANSA
TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE); TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
#else #else
@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
min_jj = js - ls - min_l - jjs; min_jj = js - ls - min_l - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA #ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda, GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
sb + min_l * (min_l + jjs) * COMPSIZE); sb + min_l * (min_l + jjs) * COMPSIZE);
@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
#ifdef SKYLAKEX
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
#ifndef TRANSA #ifndef TRANSA
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
#else #else

View File

@ -47,7 +47,11 @@ GenerateNamedObjects("abs.c" "DOUBLE" "z_abs" 0 "" "" 1)
GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1) GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" "" 1)
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
list(APPEND COMMON_SOURCES dynamic.c) if (ARM64)
list(APPEND COMMON_SOURCES dynamic_arm64.c)
else ()
list(APPEND COMMON_SOURCES dynamic.c)
endif ()
else () else ()
list(APPEND COMMON_SOURCES parameter.c) list(APPEND COMMON_SOURCES parameter.c)
endif () endif ()

View File

@ -15,7 +15,19 @@ endif
# COMMONOBJS += info.$(SUFFIX) # COMMONOBJS += info.$(SUFFIX)
ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
COMMONOBJS += dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
COMMONOBJS += dynamic_power.$(SUFFIX)
else
ifeq ($(ARCH),zarch)
COMMONOBJS += dynamic_zarch.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX) COMMONOBJS += dynamic.$(SUFFIX)
endif
endif
endif
else else
COMMONOBJS += parameter.$(SUFFIX) COMMONOBJS += parameter.$(SUFFIX)
endif endif
@ -71,7 +83,19 @@ BLAS_SERVER = blas_server.c
endif endif
ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH),arm64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX)
else
ifeq ($(ARCH),power)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
else
ifeq ($(ARCH),zarch)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
endif
endif
else else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif endif

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/ /*********************************************************************/
#include "common.h" #include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) #if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_HAIKU)
#include <dlfcn.h> #include <dlfcn.h>
#include <signal.h> #include <signal.h>
#include <sys/resource.h> #include <sys/resource.h>
@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout();
/* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */
/* jobs is queued. */ /* jobs is queued. */
/* We need this grobal for cheking if initialization is finished. */ /* We need this global for checking if initialization is finished. */
int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
/* Local Variables */ /* Local Variables */
@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
#ifdef MONITOR #ifdef MONITOR
/* Monitor is a function to see thread's status for every seconds. */ /* Monitor is a function to see thread's status for every second. */
/* Usually it turns off and it's for debugging. */ /* Usually it turns off and it's for debugging. */
static pthread_t monitor_thread; static pthread_t monitor_thread;
static int main_status[MAX_CPU_NUMBER]; static int main_status[MAX_CPU_NUMBER];
@ -582,7 +582,7 @@ int blas_thread_init(void){
if(ret!=0){ if(ret!=0){
struct rlimit rlim; struct rlimit rlim;
const char *msg = strerror(ret); const char *msg = strerror(ret);
fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create: %s\n", msg); fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg);
#ifdef RLIMIT_NPROC #ifdef RLIMIT_NPROC
if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { if(0 == getrlimit(RLIMIT_NPROC, &rlim)) {
fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC "
@ -850,6 +850,11 @@ void goto_set_num_threads(int num_threads) {
long i; long i;
#ifdef SMP_SERVER
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
#ifndef NO_AFFINITY #ifndef NO_AFFINITY

View File

@ -36,6 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdbool.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
//#include <sys/mman.h> //#include <sys/mman.h>
@ -47,13 +48,22 @@
#else #else
#ifndef OMP_SCHED
#define OMP_SCHED static
#endif
int blas_server_avail = 0; int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER]; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
#if __STDC_VERSION__ >= 201112L
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#else
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#endif
void goto_set_num_threads(int num_threads) { void goto_set_num_threads(int num_threads) {
int i=0; int i=0, j=0;
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
@ -68,15 +78,17 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number); omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread //adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
if(blas_thread_buffer[i]==NULL){ for(j=0; j<blas_cpu_number; j++){
blas_thread_buffer[i]=blas_memory_alloc(2); if(blas_thread_buffer[i][j]==NULL){
blas_thread_buffer[i][j]=blas_memory_alloc(2);
}
} }
} for(; j<MAX_CPU_NUMBER; j++){
for(; i<MAX_CPU_NUMBER; i++){ if(blas_thread_buffer[i][j]!=NULL){
if(blas_thread_buffer[i]!=NULL){ blas_memory_free(blas_thread_buffer[i][j]);
blas_memory_free(blas_thread_buffer[i]); blas_thread_buffer[i][j]=NULL;
blas_thread_buffer[i]=NULL; }
} }
} }
#if defined(ARCH_MIPS64) #if defined(ARCH_MIPS64)
@ -92,30 +104,34 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){ int blas_thread_init(void){
int i=0; int i=0, j=0;
blas_get_cpu_number(); blas_get_cpu_number();
blas_server_avail = 1; blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
blas_thread_buffer[i]=blas_memory_alloc(2); for(j=0; j<blas_num_threads; j++){
} blas_thread_buffer[i][j]=blas_memory_alloc(2);
for(; i<MAX_CPU_NUMBER; i++){ }
blas_thread_buffer[i]=NULL; for(; j<MAX_CPU_NUMBER; j++){
blas_thread_buffer[i][j]=NULL;
}
} }
return 0; return 0;
} }
int BLASFUNC(blas_thread_shutdown)(void){ int BLASFUNC(blas_thread_shutdown)(void){
int i=0; int i=0, j=0;
blas_server_avail = 0; blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){ for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
if(blas_thread_buffer[i]!=NULL){ for(j=0; j<MAX_CPU_NUMBER; j++){
blas_memory_free(blas_thread_buffer[i]); if(blas_thread_buffer[i][j]!=NULL){
blas_thread_buffer[i]=NULL; blas_memory_free(blas_thread_buffer[i][j]);
blas_thread_buffer[i][j]=NULL;
}
} }
} }
@ -206,7 +222,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} }
} }
static void exec_threads(blas_queue_t *queue){ static void exec_threads(blas_queue_t *queue, int buf_index){
void *buffer, *sa, *sb; void *buffer, *sa, *sb;
int pos=0, release_flag=0; int pos=0, release_flag=0;
@ -223,7 +239,7 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
pos = omp_get_thread_num(); pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos]; buffer = blas_thread_buffer[buf_index][pos];
//fallback //fallback
if(buffer==NULL) { if(buffer==NULL) {
@ -291,7 +307,7 @@ static void exec_threads(blas_queue_t *queue){
int exec_blas(BLASLONG num, blas_queue_t *queue){ int exec_blas(BLASLONG num, blas_queue_t *queue){
BLASLONG i; BLASLONG i, buf_index;
if ((num <= 0) || (queue == NULL)) return 0; if ((num <= 0) || (queue == NULL)) return 0;
@ -302,16 +318,39 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
} }
#endif #endif
#pragma omp parallel for schedule(static) while(true) {
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
#if __STDC_VERSION__ >= 201112L
_Bool inuse = false;
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
#else
if(blas_buffer_inuse[i] == false) {
blas_buffer_inuse[i] = true;
#endif
buf_index = i;
break;
}
}
if(i != MAX_PARALLEL_NUMBER)
break;
}
#pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) { for (i = 0; i < num; i ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3 #ifndef USE_SIMPLE_THREADED_LEVEL3
queue[i].position = i; queue[i].position = i;
#endif #endif
exec_threads(&queue[i]); exec_threads(&queue[i], buf_index);
} }
#if __STDC_VERSION__ >= 201112L
atomic_store(&blas_buffer_inuse[buf_index], false);
#else
blas_buffer_inuse[buf_index] = false;
#endif
return 0; return 0;
} }

View File

@ -50,7 +50,7 @@
/* This is a thread implementation for Win32 lazy implementation */ /* This is a thread implementation for Win32 lazy implementation */
/* Thread server common infomation */ /* Thread server common information */
typedef struct{ typedef struct{
CRITICAL_SECTION lock; CRITICAL_SECTION lock;
HANDLE filled; HANDLE filled;
@ -61,7 +61,7 @@ typedef struct{
} blas_pool_t; } blas_pool_t;
/* We need this global for cheking if initialization is finished. */ /* We need this global for checking if initialization is finished. */
int blas_server_avail = 0; int blas_server_avail = 0;
/* Local Variables */ /* Local Variables */
@ -461,13 +461,22 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed); SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){ for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], 5); //INFINITE); // Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
#ifndef OS_WINDOWSSTORE #ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0); if (WAIT_OBJECT_0 != wait_thread_value) {
TerminateThread(blas_threads[i],0);
}
#endif #endif
CloseHandle(blas_threads[i]);
} }
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0; blas_server_avail = 0;
} }
@ -478,7 +487,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
void goto_set_num_threads(int num_threads) void goto_set_num_threads(int num_threads)
{ {
long i; long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number; if (num_threads < 1) num_threads = blas_cpu_number;

Some files were not shown because too many files have changed in this diff Show More