Merge pull request #3717 from xianyi/develop
Update from develop for 0.3.21 release
This commit is contained in:
commit
9a34217cc6
|
|
@ -5,27 +5,20 @@ on: [push, pull_request]
|
|||
jobs:
|
||||
build:
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
fortran: [gfortran, flang]
|
||||
build: [cmake, make]
|
||||
exclude:
|
||||
- os: macos-latest
|
||||
fortran: flang
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
key: ${{ runner.os }}-ccache-${{ github.sha }}
|
||||
# Restore any ccache cache entry, if none for
|
||||
# ${{ runner.os }}-ccache-${{ github.sha }} exists
|
||||
restore-keys: |
|
||||
${{ runner.os }}-ccache-
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Print system information
|
||||
run: |
|
||||
|
|
@ -34,7 +27,7 @@ jobs:
|
|||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
sysctl -a | grep machdep.cpu
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
|
@ -43,61 +36,224 @@ jobs:
|
|||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get install -y gfortran cmake ccache
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||
brew reinstall gcc
|
||||
brew install coreutils cmake ccache
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB
|
||||
|
||||
- name: gfortran build
|
||||
if: matrix.build == 'make' && matrix.fortran == 'gfortran'
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
# GNU make and cmake call the compilers differently. It looks like
|
||||
# that causes the cache to mismatch. Keep the ccache for both build
|
||||
# tools separate to avoid polluting each other.
|
||||
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
export PATH="/usr/lib/ccache:${PATH}"
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
if [ "${{ matrix.build }}" = "make" ]; then
|
||||
# Add ccache to path
|
||||
if [ "$RUNNER_OS" = "Linux" ]; then
|
||||
echo "/usr/lib/ccache" >> $GITHUB_PATH
|
||||
elif [ "$RUNNER_OS" = "macOS" ]; then
|
||||
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0
|
||||
|
||||
- name: flang build
|
||||
if: matrix.build == 'make' && matrix.fortran == 'flang'
|
||||
- name: Build OpenBLAS
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
export PATH="/usr/lib/ccache:${PATH}"
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
exit 0
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
if [ "${{ matrix.fortran }}" = "flang" ]; then
|
||||
# download and install classic flang
|
||||
cd /usr/
|
||||
sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz
|
||||
sudo tar xf flang-20190329-x86-70.tgz
|
||||
sudo rm flang-20190329-x86-70.tgz
|
||||
cd -
|
||||
fi
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
|
||||
;;
|
||||
"cmake")
|
||||
mkdir build && cd build
|
||||
cmake -DDYNAMIC_ARCH=1 \
|
||||
-DNOFORTRAN=0 \
|
||||
-DBUILD_WITHOUT_LAPACK=0 \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
cmake --build .
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
cd /usr/
|
||||
sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz
|
||||
sudo tar xf flang-20190329-x86-70.tgz
|
||||
sudo rm flang-20190329-x86-70.tgz
|
||||
cd -
|
||||
- name: Show ccache status
|
||||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang
|
||||
|
||||
|
||||
- name: CMake gfortran build
|
||||
if: matrix.build == 'cmake' && matrix.fortran == 'gfortran'
|
||||
- name: Run tests
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
export PATH="/usr/lib/ccache:${PATH}"
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
|
||||
else
|
||||
echo "$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
|
||||
echo "::group::Tests in 'test' directory"
|
||||
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'ctest' directory"
|
||||
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'utest' directory"
|
||||
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
;;
|
||||
"cmake")
|
||||
cd build && ctest
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release ..
|
||||
make -j$(nproc)
|
||||
|
||||
msys2:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
msystem: [MINGW64, MINGW32, CLANG64]
|
||||
idx: [int32, int64]
|
||||
include:
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
- msystem: MINGW32
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-i686
|
||||
fc-pkg: mingw-w64-i686-gcc-fortran
|
||||
- msystem: CLANG64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
- msystem: MINGW64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
- msystem: CLANG64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
exclude:
|
||||
- msystem: MINGW32
|
||||
idx: int64
|
||||
|
||||
defaults:
|
||||
run:
|
||||
# Use MSYS2 bash as default shell
|
||||
shell: msys2 {0}
|
||||
|
||||
env:
|
||||
CHERE_INVOKING: 1
|
||||
|
||||
steps:
|
||||
- name: Get CPU name
|
||||
shell: pwsh
|
||||
run : |
|
||||
Get-CIMInstance -Class Win32_Processor | Select-Object -Property Name
|
||||
|
||||
- name: Install build dependencies
|
||||
uses: msys2/setup-msys2@v2
|
||||
with:
|
||||
msystem: ${{ matrix.msystem }}
|
||||
update: true
|
||||
release: false # Use pre-installed version
|
||||
install: >-
|
||||
base-devel
|
||||
${{ matrix.target-prefix }}-cc
|
||||
${{ matrix.fc-pkg }}
|
||||
${{ matrix.target-prefix }}-cmake
|
||||
${{ matrix.target-prefix }}-ninja
|
||||
${{ matrix.target-prefix }}-ccache
|
||||
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
# It looks like this path needs to be hard-coded.
|
||||
path: C:/msys64/home/runneradmin/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch.
|
||||
restore-keys: |
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}
|
||||
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}
|
||||
|
||||
- name: Configure ccache
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
|
||||
run: |
|
||||
which ccache
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 250M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
echo $HOME
|
||||
cygpath -w $HOME
|
||||
|
||||
- name: Configure OpenBLAS
|
||||
run: |
|
||||
mkdir build && cd build
|
||||
cmake -DBUILD_SHARED_LIBS=ON \
|
||||
-DBUILD_STATIC_LIBS=ON \
|
||||
-DDYNAMIC_ARCH=ON \
|
||||
-DUSE_THREAD=ON \
|
||||
-DNUM_THREADS=64 \
|
||||
-DTARGET=CORE2 \
|
||||
${{ matrix.idx64-flags }} \
|
||||
${{ matrix.c-lapack-flags }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: cd build && cmake --build .
|
||||
|
||||
- name: Show ccache status
|
||||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 60
|
||||
run: cd build && ctest
|
||||
|
|
|
|||
15
.travis.yml
15
.travis.yml
|
|
@ -25,11 +25,12 @@ matrix:
|
|||
# - BTYPE="BINARY=64"
|
||||
#
|
||||
# - <<: *test-ubuntu
|
||||
os: linux-ppc64le
|
||||
os: linux
|
||||
arch: ppc64le
|
||||
before_script: &common-before
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
|
||||
script:
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- travis_wait 20 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
|
|
@ -43,6 +44,7 @@ matrix:
|
|||
arch: s390x
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32"
|
||||
- sudo apt-get install --only-upgrade binutils
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=IBMZ_LINUX
|
||||
|
|
@ -55,6 +57,7 @@ matrix:
|
|||
compiler: clang
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32"
|
||||
- sudo apt-get install --only-upgrade binutils
|
||||
env:
|
||||
# for matrix annotation only
|
||||
- TARGET_BOX=IBMZ_LINUX
|
||||
|
|
@ -101,7 +104,7 @@ matrix:
|
|||
- sudo apt-get update
|
||||
- sudo apt-get install gcc-9 gfortran-9 -y
|
||||
script:
|
||||
- make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- travis_wait 20 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
|
|
@ -118,7 +121,7 @@ matrix:
|
|||
- sudo apt-get update
|
||||
- sudo apt-get install gcc-9 gfortran-9 -y
|
||||
script:
|
||||
- make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- travis_wait 20 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
|
|
@ -269,9 +272,9 @@ matrix:
|
|||
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
|
||||
# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
|
||||
|
||||
- &test-graviton2
|
||||
- &test-neoversen1
|
||||
os: linux
|
||||
arch: arm64-graviton2
|
||||
arch: arm64
|
||||
dist: focal
|
||||
group: edge
|
||||
virt: lxd
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
Thank you for the support.
|
||||
|
||||
### [2019.12/2021.9] [Chan-Zuckerberg Foundation EOSS Initiative](https://chanzuckerberg.com/eoss/)
|
||||
|
||||
Between December 2019 and September 2021, the development and maintenance of OpenBLAS were funded in part by the Chan-Zuckerberg Foundation in the context of two grants awarded to the NumPy Foundation and managed by NumFocus (Cycles 1 and 3 of the Essential Open Source Software for Science (EOSS) Initiative of the Chan-Zuckerberg Foundation).
|
||||
|
||||
### [2013.8] [Testbed for OpenBLAS project](https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project)
|
||||
|
||||
https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project/pledges
|
||||
|
|
|
|||
|
|
@ -17,14 +17,12 @@ include(GNUInstallDirs)
|
|||
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
if(MSVC AND NOT DEFINED NOFORTRAN)
|
||||
set(NOFORTRAN ON)
|
||||
endif()
|
||||
|
||||
#######
|
||||
if(MSVC)
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
|
||||
endif()
|
||||
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
|
||||
|
||||
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
|
||||
|
||||
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
|
||||
|
||||
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
|
||||
|
||||
|
|
@ -36,6 +34,8 @@ option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several
|
|||
|
||||
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
|
||||
|
||||
option(USE_PERL "Use the older PERL scripts for build preparation instead of universal shell scripts" OFF)
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
|
||||
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
|
||||
else()
|
||||
|
|
@ -179,7 +179,7 @@ endforeach ()
|
|||
|
||||
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
|
||||
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
|
||||
if (NOT NOFORTRAN AND NOT NO_LAPACK)
|
||||
if (NOT NO_LAPACK)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake")
|
||||
if (NOT NO_LAPACKE)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake")
|
||||
|
|
@ -205,8 +205,8 @@ endif ()
|
|||
|
||||
# add objects to the openblas lib
|
||||
if(NOT NO_LAPACK)
|
||||
add_library(LAPACK OBJECT ${LA_SOURCES})
|
||||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>")
|
||||
add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES})
|
||||
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>")
|
||||
endif()
|
||||
if(NOT NO_LAPACKE)
|
||||
add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
|
||||
|
|
@ -247,7 +247,7 @@ endif()
|
|||
|
||||
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
|
||||
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
|
||||
if (NOT NOFORTRAN)
|
||||
if (NOT NOFORTRAN)
|
||||
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
|
||||
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
|
||||
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
|
||||
|
|
@ -314,14 +314,16 @@ endif()
|
|||
if (NOT NOFORTRAN)
|
||||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
if (BUILD_TESTING)
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
endif()
|
||||
endif()
|
||||
if(NOT NO_CBLAS)
|
||||
add_subdirectory(ctest)
|
||||
endif()
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
|
||||
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
|
||||
|
|
@ -394,14 +396,23 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
|
|||
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED USE_PERL)
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
else()
|
||||
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
|
||||
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
|
||||
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
|
||||
COMMENT "renaming symbols"
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
|
||||
# Install project
|
||||
|
||||
# Install libraries
|
||||
|
|
|
|||
|
|
@ -207,3 +207,8 @@ In chronological order:
|
|||
|
||||
* Ilya Kurdyukov <https://github.com/ilyakurdyukov>
|
||||
* [2021-02-21] Add basic support for the Elbrus E2000 architecture
|
||||
|
||||
* PLCT Lab, Institute of Software Chinese Academy of Sciences
|
||||
* [2022-03] Support RISC-V Vector Intrinsic 1.0 version.
|
||||
|
||||
|
||||
|
|
@ -1,4 +1,86 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.21
|
||||
07-Aug-2022
|
||||
|
||||
general:
|
||||
- Updated the included LAPACK to Reference-LAPACK release 3.10.1
|
||||
- when no Fortran compiler is available, OpenBLAS builds will now automatically
|
||||
build LAPACK from an f2c-converted copy of LAPACK 3.9.0 unless the NO_LAPACK option
|
||||
is specified
|
||||
- similarly added C versions of the BLAS and CBLAS tests
|
||||
- enabled building of the ReLAPACK GEMMT kernels when ReLAPACK is built
|
||||
- function LAPACKE_lsame is now annotated with the GCC attribute "const" to aid static analyzers
|
||||
- added USE_TLS to the list of options reported by the openblas_get_config() function
|
||||
- CMAKE builds now support the BUILD_TESTING keyword (to disable the LAPACK testsuite) of Reference-LAPACK
|
||||
- fixed CMAKE builds of the laswp_ncopy and neg_tcopy kernels
|
||||
- removed the build system requirements for PERL (while keeping the original perl scripts as backup)
|
||||
- handle building and running OpenBLAS on systems that report zero available cpu cores
|
||||
- added SYMBOLPREFIX/SYMBOLSUFFIX handling for LAPACK 3.10.0 functions added in 0.3.20
|
||||
- fixed linking of the utests on QNX
|
||||
- Added support for compilation with the Intel ifx compiler
|
||||
- Added support for compilation with the Fujitsu FCC compiler for Fugaku
|
||||
- Added support for compilation with the Cray C and Fortran compilers
|
||||
- reverted OpenMP threadpool behaviour in the exec_blas call to its state before 0.3.11, that is
|
||||
the threadpool will no longer grow or shrink on demand as the overhead for this is too big at least with
|
||||
GNU OpenMP. The adaptive behaviour introduced in 0.3.11 can still be requested at runtime by setting
|
||||
the environment variable OMP_ADAPTIVE
|
||||
- worked around spurious STFSM/CTFSM errors reported by the LAPACK testsuite
|
||||
|
||||
x86_64:
|
||||
- fixed determination of compiler support for AVX512 and removed the 0.3.19
|
||||
workaround for building SKYLAKEX kernels on Sandybridge hardware
|
||||
- fixed compilation for the SKYLAKEX target with gcc 6
|
||||
- fixed compilation of the CooperLake SBGEMM kernel with LLVM
|
||||
- fixed compilation of the SkyLakeX small matrix GEMM kernels with LLVM or ICC
|
||||
- fixed compilation of some BFLOAT16 kernels with CMAKE
|
||||
- added support for the Zhaoxin/Centaur KH40000 cpu
|
||||
- fixed a potential crash in the ZSYMV kernel used for all targets except generic
|
||||
- fixed gmake compilation for DYNAMIC_ARCH with a DYNAMIC_LIST including ATOM
|
||||
- fixed compilation of LAPACKE with the INTEGER64 option on Windows
|
||||
- added support for cross-compiling to individual Intel or AMD targets using CMAKE
|
||||
(previously only CORE2 supported, added targets are ATOM, PRESCOTT, NEHALEM, SANDYBRIDGE,
|
||||
HASWELL,SKYLAKEX, COOPERLAKE, SAPPHIRERAPIDS, OPTERON, BARCELONA, BULLDOZER, PILEDRIVER,
|
||||
STEAMROLLER,EXCAVATOR, ZEN)
|
||||
|
||||
SPARC:
|
||||
- worked around an overflow error in the DNRM2 kernel
|
||||
|
||||
POWER:
|
||||
- worked around an overflow error in the POWER6 DNRM2 kernel
|
||||
- fixed compilation on PPC440
|
||||
- fixed a performance regression in the level1 BLAS on POWER10
|
||||
- fixed the POWER10 ZGEMM kernel
|
||||
- fixed singlethreaded builds for POWER10
|
||||
- fixed compilation of the POWER10 DGEMV kernel with older gcc versions
|
||||
- enabled compilation of the BFLOAT16 kernels by default
|
||||
- enabled the small matrix kernels by default for DYNAMIC_ARCH builds
|
||||
- added a workaround for a miscompilation of the CDOT and ZDOT kernels by GCC 12
|
||||
|
||||
RISCV:
|
||||
- fixed cpu autodetection logic
|
||||
|
||||
ARMV8:
|
||||
- added an SBGEMM kernel for Neoverse N2
|
||||
- worked around an overflow error in the DNRM2 kernel used on M1, NeoverseN1, ThunderX2T99
|
||||
- added support for ARM64 systems running MS Windows
|
||||
- added support for cross-compiling to the GENERIC ARMV8 target under CMAKE (Windows/MSVC)
|
||||
- fixed a performance regression in the generic ARMV8 DGEMM kernel introduced in 0.3.19
|
||||
- added initial support for the Apple M1 cpu under Linux
|
||||
- added initial support for the Phytium FT2000 cpu
|
||||
- added initial support for the Cortex A510, A710, X1 and X2 cpus
|
||||
- fixed an accidental mixup of cpu identifiers in the autodetection code introduced in 0.3.20
|
||||
- fixed linking of Apple M1 builds on macOS 12 and later with recent XCode
|
||||
- made Neoverse N2 available in DYNAMIC_ARCH builds
|
||||
|
||||
MIPS,MIPS64:
|
||||
- worked around an overflow error in the DNRM2 kernel
|
||||
|
||||
LOONGARCH64:
|
||||
- worked around an overflow error in the DNRM2 kernel
|
||||
- added preliminary support for the LOONGSON2K1000 cpu
|
||||
- added DYNAMIC_ARCH support
|
||||
|
||||
====================================================================
|
||||
Version 0.3.20
|
||||
20-Feb-2022
|
||||
|
|
|
|||
22
Makefile
22
Makefile
|
|
@ -25,11 +25,14 @@ ifeq ($(NO_FORTRAN), 1)
|
|||
define NOFORTRAN
|
||||
1
|
||||
endef
|
||||
define NO_LAPACK
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
define C_LAPACK
|
||||
1
|
||||
endef
|
||||
endif
|
||||
export NOFORTRAN
|
||||
export NO_LAPACK
|
||||
export C_LAPACK
|
||||
endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
|
||||
|
|
@ -146,21 +149,25 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
|||
ifndef NO_FBLAS
|
||||
$(MAKE) -C test all
|
||||
endif
|
||||
endif
|
||||
ifneq ($(ONLY_CBLAS), 1)
|
||||
$(MAKE) -C utest all
|
||||
endif
|
||||
ifneq ($(NO_CBLAS), 1)
|
||||
ifneq ($(ONLY_CBLAS), 1)
|
||||
$(MAKE) -C ctest all
|
||||
endif
|
||||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||
$(MAKE) -C cpp_thread_test all
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
libs :
|
||||
ifeq ($(CORE), UNKNOWN)
|
||||
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
|
||||
endif
|
||||
ifeq ($(NOFORTRAN), 1)
|
||||
$(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.)
|
||||
$(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.)
|
||||
endif
|
||||
ifeq ($(NO_STATIC), 1)
|
||||
ifeq ($(NO_SHARED), 1)
|
||||
|
|
@ -241,19 +248,14 @@ hpl_p :
|
|||
fi; \
|
||||
done
|
||||
|
||||
ifeq ($(NO_LAPACK), 1)
|
||||
netlib :
|
||||
|
||||
else
|
||||
netlib : lapack_prebuild
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
|
||||
endif
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(NO_LAPACK), 1)
|
||||
re_lapack :
|
||||
|
|
@ -267,7 +269,7 @@ prof_lapack : lapack_prebuild
|
|||
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
|
||||
|
||||
lapack_prebuild :
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK)))
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
|
|
|||
|
|
@ -3,6 +3,9 @@ ifneq ($(C_COMPILER), PGI)
|
|||
ifeq ($(C_COMPILER), CLANG)
|
||||
ISCLANG=1
|
||||
endif
|
||||
ifeq ($(C_COMPILER), FUJITSU)
|
||||
ISCLANG=1
|
||||
endif
|
||||
ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG)))
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
|
|
@ -55,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FT2000)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N1 is only available
|
||||
# in GCC>=9
|
||||
ifeq ($(CORE), NEOVERSEN1)
|
||||
|
|
@ -114,9 +124,9 @@ ifeq ($(CORE), NEOVERSEN2)
|
|||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
|
||||
CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
|
||||
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
|
||||
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.5-a -mtune=native
|
||||
|
|
@ -229,6 +239,43 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), CORTEXX1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), CORTEXX2)
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
#ifeq (1, $(filter 1,$(ISCLANG)))
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), CORTEXA510)
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
|
||||
ifeq ($(CORE), CORTEXA710)
|
||||
CCOMMON_OPT += -march=armv8.4-a+sve
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.4-a+sve
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -15,6 +15,12 @@ TARGET_MAKE = Makefile.conf
|
|||
TARGET_CONF = config.h
|
||||
endif
|
||||
|
||||
ifdef USE_PERL
|
||||
SCRIPTSUFFIX = .pl
|
||||
else
|
||||
SCRIPTSUFFIX =
|
||||
endif
|
||||
|
||||
# CPUIDEMU = ../../cpuid/table.o
|
||||
|
||||
ifdef CPUIDEMU
|
||||
|
|
@ -46,17 +52,17 @@ TARGET_FLAGS = -mips64r6
|
|||
endif
|
||||
|
||||
ifeq ($(TARGET), C910V)
|
||||
TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v
|
||||
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
|
||||
endif
|
||||
|
||||
all: getarch_2nd
|
||||
./getarch_2nd 0 >> $(TARGET_MAKE)
|
||||
./getarch_2nd 1 >> $(TARGET_CONF)
|
||||
|
||||
config.h : c_check f_check getarch
|
||||
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
|
||||
$(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch
|
||||
./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
|
||||
ifneq ($(ONLY_CBLAS), 1)
|
||||
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
|
||||
./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
|
||||
else
|
||||
#When we only build CBLAS, we set NOFORTRAN=2
|
||||
echo "NOFORTRAN=2" >> $(TARGET_MAKE)
|
||||
|
|
@ -71,9 +77,11 @@ endif
|
|||
|
||||
|
||||
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
|
||||
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
|
||||
avx512=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \
|
||||
rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \
|
||||
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
|
||||
|
||||
getarch_2nd : getarch_2nd.c config.h dummy
|
||||
getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy
|
||||
ifndef TARGET_CORE
|
||||
$(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c
|
||||
else
|
||||
|
|
@ -81,3 +89,5 @@ else
|
|||
endif
|
||||
|
||||
dummy:
|
||||
|
||||
.PHONY: dummy
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
ifeq ($(CORE), C910V)
|
||||
CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v
|
||||
FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static
|
||||
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
|
||||
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -261,8 +261,9 @@ endif
|
|||
#For small matrix optimization
|
||||
ifeq ($(ARCH), x86_64)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
else ifeq ($(CORE), POWER10)
|
||||
else ifeq ($(ARCH), power)
|
||||
SMALL_MATRIX_OPT = 1
|
||||
BUILD_BFLOAT16 = 1
|
||||
endif
|
||||
ifeq ($(SMALL_MATRIX_OPT), 1)
|
||||
CCOMMON_OPT += -DSMALL_MATRIX_OPT
|
||||
|
|
@ -352,7 +353,7 @@ OBJCONV = $(CROSS_SUFFIX)objconv
|
|||
|
||||
# When fortran support was either not detected or actively deselected, only build BLAS.
|
||||
ifeq ($(NOFORTRAN), 1)
|
||||
NO_LAPACK = 1
|
||||
C_LAPACK = 1
|
||||
override FEXTRALIB =
|
||||
endif
|
||||
|
||||
|
|
@ -384,8 +385,12 @@ endif
|
|||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
ifndef MACOSX_DEPLOYMENT_TARGET
|
||||
ifeq ($(ARCH), arm64)
|
||||
export MACOSX_DEPLOYMENT_TARGET=11.0
|
||||
else
|
||||
export MACOSX_DEPLOYMENT_TARGET=10.8
|
||||
endif
|
||||
endif
|
||||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
|
|
@ -675,6 +680,10 @@ ifeq ($(ARCH), mips64)
|
|||
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
DYNAMIC_CORE = ZARCH_GENERIC
|
||||
|
||||
|
|
@ -847,7 +856,7 @@ CCOMMON_OPT += -mabi=32
|
|||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
||||
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
||||
CCOMMON_OPT += -march=loongson3a
|
||||
FCOMMON_OPT += -march=loongson3a
|
||||
endif
|
||||
|
|
@ -887,11 +896,9 @@ BINARY_DEFINED = 1
|
|||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
ifeq ($(CORE), LOONGSON3R5)
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
|
@ -1041,9 +1048,13 @@ FCOMMON_OPT += -frecursive
|
|||
# work around ABI problem with passing single-character arguments
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
ifneq ($(NOFORTRAN), 1)
|
||||
ifneq ($(NOFORTRAN), 2)
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
EXTRALIB += -lgfortran
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifdef NO_BINARY_MODE
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
||||
ifdef BINARY64
|
||||
|
|
@ -1179,7 +1190,6 @@ FCOMMON_OPT += -i8
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
||||
ifndef BINARY64
|
||||
FCOMMON_OPT += -n32
|
||||
|
|
@ -1189,11 +1199,9 @@ endif
|
|||
ifeq ($(CORE), LOONGSON3R3)
|
||||
FCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3R4)
|
||||
FCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
else
|
||||
ifndef BINARY64
|
||||
FCOMMON_OPT += -m32
|
||||
|
|
@ -1201,7 +1209,6 @@ else
|
|||
FCOMMON_OPT += -m64
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FEXTRALIB += -lstdc++
|
||||
FCOMMON_OPT += -mp
|
||||
|
|
@ -1209,7 +1216,6 @@ endif
|
|||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), OPEN64)
|
||||
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
|
||||
ifndef BINARY64
|
||||
CCOMMON_OPT += -n32
|
||||
|
|
@ -1219,13 +1225,10 @@ endif
|
|||
ifeq ($(CORE), LOONGSON3R3)
|
||||
CCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3R4)
|
||||
CCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
else
|
||||
|
||||
ifndef BINARY64
|
||||
CCOMMON_OPT += -m32
|
||||
else
|
||||
|
|
@ -1271,6 +1274,19 @@ FCOMMON_OPT += -openmp
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), CRAY)
|
||||
CCOMMON_OPT += -DF_INTERFACE_INTEL
|
||||
FCOMMON_OPT += -hnopattern
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
FCOMMON_OPT += -s integer64
|
||||
endif
|
||||
endif
|
||||
ifneq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -O noomp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef BINARY64
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
|
|
@ -1303,6 +1319,10 @@ ifeq ($(DYNAMIC_OLDER), 1)
|
|||
CCOMMON_OPT += -DDYNAMIC_OLDER
|
||||
endif
|
||||
|
||||
ifeq ($(C_LAPACK), 1)
|
||||
CCOMMON_OPT += -DC_LAPACK
|
||||
endif
|
||||
|
||||
ifeq ($(NO_LAPACK), 1)
|
||||
CCOMMON_OPT += -DNO_LAPACK
|
||||
#Disable LAPACK C interface
|
||||
|
|
@ -1532,7 +1552,7 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
|
|||
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
|
||||
#MAKEOVERRIDES =
|
||||
|
||||
ifdef NEED_PIC
|
||||
ifeq ($(NEED_PIC), 1)
|
||||
ifeq (,$(findstring PIC,$(FFLAGS)))
|
||||
override FFLAGS += -fPIC
|
||||
endif
|
||||
|
|
@ -1550,6 +1570,11 @@ endif
|
|||
|
||||
ifeq ($(F_COMPILER),NAG)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
ifeq ($(F_COMPILER),CRAY)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
|
||||
LAPACK_CFLAGS = $(CFLAGS)
|
||||
|
|
@ -1562,6 +1587,7 @@ endif
|
|||
|
||||
ifdef OS_WINDOWS
|
||||
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
|
||||
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
|
||||
endif
|
||||
ifeq ($(C_COMPILER), LSB)
|
||||
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
|
||||
|
|
@ -1661,6 +1687,7 @@ export USE_OPENMP
|
|||
export CROSS
|
||||
export CROSS_SUFFIX
|
||||
export NOFORTRAN
|
||||
export C_LAPACK
|
||||
export NO_FBLAS
|
||||
export EXTRALIB
|
||||
export CEXTRALIB
|
||||
|
|
|
|||
|
|
@ -92,6 +92,10 @@ CORTEXA53
|
|||
CORTEXA57
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
CORTEXA510
|
||||
CORTEXA710
|
||||
CORTEXX1
|
||||
CORTEXX2
|
||||
NEOVERSEN1
|
||||
NEOVERSEV1
|
||||
NEOVERSEN2
|
||||
|
|
@ -103,6 +107,9 @@ THUNDERX2T99
|
|||
TSV110
|
||||
THUNDERX3T110
|
||||
VORTEX
|
||||
A64FX
|
||||
ARMV8SVE
|
||||
FT2000
|
||||
|
||||
9.System Z:
|
||||
ZARCH_GENERIC
|
||||
|
|
@ -114,7 +121,9 @@ RISCV64_GENERIC
|
|||
C910V
|
||||
|
||||
11.LOONGARCH64:
|
||||
LOONGSONGENERIC
|
||||
LOONGSON3R5
|
||||
LOONGSON2K1000
|
||||
|
||||
12. Elbrus E2000:
|
||||
E2K
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ jobs:
|
|||
- task: CMake@1
|
||||
inputs:
|
||||
workingDirectory: 'build' # Optional
|
||||
cmakeArgs: '-G "Visual Studio 16 2019" ..'
|
||||
cmakeArgs: '-G "Visual Studio 17 2022" ..'
|
||||
- task: CMake@1
|
||||
inputs:
|
||||
cmakeArgs: '--build . --config Release'
|
||||
|
|
@ -81,7 +81,7 @@ jobs:
|
|||
vmImage: 'windows-latest'
|
||||
steps:
|
||||
- script: |
|
||||
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL"
|
||||
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="SANDYBRIDGE"
|
||||
|
||||
- job: Windows_clang_cmake
|
||||
pool:
|
||||
|
|
@ -103,7 +103,7 @@ jobs:
|
|||
|
||||
- job: Windows_flang_clang
|
||||
pool:
|
||||
vmImage: 'windows-latest'
|
||||
vmImage: 'windows-2022'
|
||||
steps:
|
||||
- script: |
|
||||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
|
||||
|
|
@ -114,11 +114,31 @@ jobs:
|
|||
conda install --yes --quiet ninja flang
|
||||
mkdir build
|
||||
cd build
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
- job: Windows_cl_flang
|
||||
pool:
|
||||
vmImage: 'windows-2022'
|
||||
steps:
|
||||
- script: |
|
||||
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
|
||||
set "LIB=C:\Miniconda\Library\lib;%LIB%"
|
||||
set "CPATH=C:\Miniconda\Library\include;%CPATH%"
|
||||
conda config --add channels conda-forge --force
|
||||
conda config --set auto_update_conda false
|
||||
conda install --yes --quiet ninja flang
|
||||
mkdir build
|
||||
cd build
|
||||
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
|
||||
cmake --build . --config Release
|
||||
ctest
|
||||
|
||||
|
||||
|
||||
- job: OSX_OpenMP
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
|
|
@ -143,11 +163,12 @@ jobs:
|
|||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOSX_DEPLOYMENT_TARGET: 11.0
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
|
||||
make TARGET=CORE2 USE_OPENMP=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang NOFORTRAN=1
|
||||
|
||||
- job: OSX_OpenMP_Clang_cmake
|
||||
pool:
|
||||
|
|
@ -178,7 +199,7 @@ jobs:
|
|||
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
|
||||
cmake --build .
|
||||
ctest
|
||||
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
|
|
|
|||
|
|
@ -1,426 +1,415 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
#!/bin/sh
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;
|
||||
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
|
||||
chop($hostarch);
|
||||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/);
|
||||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
hostos=`uname -s | sed -e 's/\-.*//'`
|
||||
hostarch=`uname -m | sed -e 's/i.86/x86/'`
|
||||
if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then
|
||||
hostarch=`uname -p`
|
||||
fi
|
||||
case "$hostarch" in
|
||||
amd64) hostarch=x86_64 ;;
|
||||
arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;;
|
||||
aarch64) hostarch=arm64 ;;
|
||||
powerpc*|ppc*) hostarch=power ;;
|
||||
s390x) hostarch=zarch ;;
|
||||
esac
|
||||
|
||||
#$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$binary = $ENV{"BINARY"};
|
||||
makefile="$1"
|
||||
config="$2"
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
$config = shift(@ARGV);
|
||||
|
||||
$compiler_name = shift(@ARGV);
|
||||
$flags = join(" ", @ARGV);
|
||||
compiler_name="$3"
|
||||
shift 3
|
||||
flags="$*"
|
||||
|
||||
# First, we need to know the target OS and compiler name
|
||||
|
||||
$data = `$compiler_name $flags -E ctest.c`;
|
||||
|
||||
if ($?) {
|
||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||
die 1;
|
||||
{
|
||||
data=`$compiler_name $flags -E ctest.c`
|
||||
} || {
|
||||
printf '%s\n' "C Compiler ($compiler_name) is something wrong." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
$cross_suffix = "";
|
||||
cross_suffix=""
|
||||
|
||||
eval "use File::Basename";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Basename, emulating its functionality";
|
||||
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
|
||||
if ($dirnam ne ".") {
|
||||
$cross_suffix .= $dirnam . "/";
|
||||
}
|
||||
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
|
||||
if ($basnam =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
} else {
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
if [ "`dirname $compiler_name`" != '.' ]; then
|
||||
cross_suffix="$cross_suffix`dirname $compiler_name`/"
|
||||
fi
|
||||
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
}
|
||||
bn=`basename $compiler_name`
|
||||
case "$bn" in
|
||||
*-*) cross_suffix="$cross_suffix${bn%-*}-"
|
||||
esac
|
||||
|
||||
$compiler = "";
|
||||
$compiler = LSB if ($data =~ /COMPILER_LSB/);
|
||||
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
|
||||
$compiler = PGI if ($data =~ /COMPILER_PGI/);
|
||||
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
|
||||
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
|
||||
$compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/);
|
||||
$compiler = SUN if ($data =~ /COMPILER_SUN/);
|
||||
$compiler = IBM if ($data =~ /COMPILER_IBM/);
|
||||
$compiler = DEC if ($data =~ /COMPILER_DEC/);
|
||||
$compiler = GCC if ($compiler eq "");
|
||||
compiler=""
|
||||
case "$data" in
|
||||
*COMPILER_LSB*) compiler=LSB ;;
|
||||
*COMPILER_CLANG*) compiler=CLANG ;;
|
||||
*COMPILER_PGI*) compiler=PGI ;;
|
||||
*COMPILER_PATHSCALE*) compiler=PATHSCALE ;;
|
||||
*COMPILER_INTEL*) compiler=INTEL ;;
|
||||
*COMPILER_OPEN64*) compiler=OPEN64 ;;
|
||||
*COMPILER_SUN*) compiler=SUN ;;
|
||||
*COMPILER_IBM*) compiler=IBM ;;
|
||||
*COMPILER_DEC*) compiler=DEC ;;
|
||||
*COMPILER_FUJITSU*) compiler=FUJITSU ;;
|
||||
esac
|
||||
if [ -z "$compiler" ]; then
|
||||
compiler=GCC
|
||||
fi
|
||||
|
||||
$os = Linux if ($data =~ /OS_LINUX/);
|
||||
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
||||
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
||||
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
|
||||
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
|
||||
$os = Darwin if ($data =~ /OS_DARWIN/);
|
||||
$os = SunOS if ($data =~ /OS_SUNOS/);
|
||||
$os = AIX if ($data =~ /OS_AIX/);
|
||||
$os = osf if ($data =~ /OS_OSF/);
|
||||
$os = WINNT if ($data =~ /OS_WINNT/);
|
||||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
case "$data" in *OS_LINUX*) os=Linux ;; esac
|
||||
case "$data" in *OS_FREEBSD*) os=FreeBSD ;; esac
|
||||
case "$data" in *OS_NETBSD*) os=NetBSD ;; esac
|
||||
case "$data" in *OS_OPENBSD*) os=OpenBSD ;; esac
|
||||
case "$data" in *OS_DRAGONFLY*) os=DragonFly ;; esac
|
||||
case "$data" in *OS_DARWIN*) os=Darwin ;; esac
|
||||
case "$data" in *OS_SUNOS*) os=SunOS ;; esac
|
||||
case "$data" in *OS_AIX*) os=AIX ;; esac
|
||||
case "$data" in *OS_OSF*) os=osf ;; esac
|
||||
case "$data" in *OS_WINNT*) os=WINNT ;; esac
|
||||
case "$data" in *OS_CYGWIN_NT*) os=CYGWIN_NT ;; esac
|
||||
case "$data" in *OS_INTERIX*) os=Interix ;; esac
|
||||
case "$data" in *OS_ANDROID*) os=Android ;; esac
|
||||
case "$data" in *OS_HAIKU*) os=Haiku ;; esac
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
case "$data" in
|
||||
*ARCH_X86_64*) architecture=x86_64 ;;
|
||||
*ARCH_X86*) architecture=x86 ;;
|
||||
*ARCH_E2K*) architecture=e2k ;;
|
||||
*ARCH_POWER*) architecture=power ;;
|
||||
*ARCH_MIPS64*) architecture=mips64 ;;
|
||||
*ARCH_MIPS*) architecture=mips ;;
|
||||
*ARCH_ALPHA*) architecture=alpha ;;
|
||||
*ARCH_SPARC*) architecture=sparc ;;
|
||||
*ARCH_IA64*) architecture=ia64 ;;
|
||||
*ARCH_ARM64*) architecture=arm64 ;;
|
||||
*ARCH_ARM*) architecture=arm ;;
|
||||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_RISCV64*) architecture=riscv64 ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
esac
|
||||
|
||||
$defined = 0;
|
||||
defined=0
|
||||
|
||||
if ($os eq "AIX") {
|
||||
$compiler_name .= " -maix32" if ($binary eq "32");
|
||||
$compiler_name .= " -maix64" if ($binary eq "64");
|
||||
$defined = 1;
|
||||
}
|
||||
if [ "$os" = "AIX" ]; then
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -maix32" ;;
|
||||
64) compiler_name="$compiler_name -maix64" ;;
|
||||
esac
|
||||
defined=1
|
||||
fi
|
||||
|
||||
if ($architecture eq "mips") {
|
||||
$compiler_name .= " -mabi=32";
|
||||
$defined = 1;
|
||||
}
|
||||
case "$architecture" in
|
||||
mips)
|
||||
compiler_name="$compiler_name -mabi=32"
|
||||
defined=1
|
||||
;;
|
||||
mips64)
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -mabi=n32" ;;
|
||||
64) compiler_name="$compiler_name -mabi=64" ;;
|
||||
esac
|
||||
defined=1
|
||||
;;
|
||||
arm|arm64) defined=1 ;;
|
||||
zarch|e2k|alpha|ia64|riscv64|loongarch64)  # targets that are always built 64-bit; marking them defined skips the generic -m32/-m64 switch
|
||||
defined=1
|
||||
BINARY=64
|
||||
;;
|
||||
x86)
|
||||
[ "$os" != "Darwin" ] && [ "$os" != "SunOS" ] && {
|
||||
defined=1
|
||||
BINARY=32
|
||||
}
|
||||
;;
|
||||
esac
|
||||
|
||||
if ($architecture eq "mips64") {
|
||||
$compiler_name .= " -mabi=n32" if ($binary eq "32");
|
||||
$compiler_name .= " -mabi=64" if ($binary eq "64");
|
||||
$defined = 1;
|
||||
}
|
||||
case "$compiler" in
|
||||
PGI)
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -tp p7" ;;
|
||||
64) compiler_name="$compiler_name -tp p7-64" ;;
|
||||
esac
|
||||
openmp='-mp'
|
||||
defined=1
|
||||
;;
|
||||
IBM)
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -q32" ;;
|
||||
64) compiler_name="$compiler_name -q64" ;;
|
||||
esac
|
||||
openmp='-qsmp=omp'
|
||||
defined=1
|
||||
;;
|
||||
INTEL) openmp='-openmp' ;;
|
||||
PATHSCALE|OPEN64) openmp='-mp' ;;
|
||||
CLANG|GCC|LSB) openmp='-fopenmp' ;;
|
||||
FUJITSU) openmp='-Kopenmp' ;;
|
||||
esac
|
||||
|
||||
if (($architecture eq "arm") || ($architecture eq "arm64")) {
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "zarch") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "e2k") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "alpha") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "ia64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) {
|
||||
$defined = 1;
|
||||
$binary =32;
|
||||
}
|
||||
|
||||
if ($architecture eq "riscv64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "loongarch64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
$openmp = "-mp";
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($compiler eq "IBM") {
|
||||
$compiler_name .= " -q32" if ($binary eq "32");
|
||||
$compiler_name .= " -q64" if ($binary eq "64");
|
||||
$openmp = "-qsmp=omp";
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($compiler eq "INTEL") {
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler eq "PATHSCALE") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler eq "OPEN64") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler eq "CLANG") {
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler eq "GCC" || $compiler eq "LSB") {
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($defined == 0) {
|
||||
$compiler_name .= " -m32" if ($binary eq "32");
|
||||
$compiler_name .= " -m64" if ($binary eq "64");
|
||||
}
|
||||
if [ "$defined" -eq 0 ]; then
|
||||
case "$BINARY" in
|
||||
32) compiler_name="$compiler_name -m32" ;;
|
||||
64) compiler_name="$compiler_name -m64" ;;
|
||||
esac
|
||||
fi
|
||||
|
||||
# Do again
|
||||
|
||||
$data = `$compiler_name $flags -E ctest.c`;
|
||||
|
||||
if ($?) {
|
||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||
die 1;
|
||||
}
|
||||
|
||||
$have_msa = 0;
|
||||
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
} else {
|
||||
$have_msa = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
$no_avx512= 0;
|
||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
|
||||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
|
||||
if ($compiler eq "PGI") {
|
||||
$args = " -tp skylake -c -o $tmpf.o $tmpf";
|
||||
}
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_avx512 = 1;
|
||||
} else {
|
||||
$no_avx512 = 0;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
$c11_atomics = 0;
|
||||
if ($data =~ /HAVE_C11/) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11";
|
||||
$c11_atomics = 0;
|
||||
} else {
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
print $tmpf "#include <stdatomic.h>\nint main(void){}\n";
|
||||
$args = " -c -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$c11_atomics = 0;
|
||||
} else {
|
||||
$c11_atomics = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) {
|
||||
$no_avx2 = 0;
|
||||
$oldgcc = 0;
|
||||
$data = `$compiler_name -dumpversion`;
|
||||
if ($data <= 4.6) {
|
||||
$no_avx2 = 1;
|
||||
$oldgcc = 1;
|
||||
}
|
||||
}
|
||||
|
||||
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
||||
$data =~ /globl\s([_\.]*)(.*)/;
|
||||
|
||||
$need_fu = $1;
|
||||
|
||||
$cross = 0;
|
||||
|
||||
if ($architecture ne $hostarch) {
|
||||
$cross = 1;
|
||||
$cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86"));
|
||||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||
}
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
$linker_l = "";
|
||||
$linker_a = "";
|
||||
|
||||
{
|
||||
$link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
@flags = split(/[\s\,\n]/, $link);
|
||||
# remove leading and trailing quotes from each flag.
|
||||
@flags = map {s/^['"]|['"]$//g; $_} @flags;
|
||||
|
||||
foreach $flags (@flags) {
|
||||
if (
|
||||
($flags =~ /^\-L/)
|
||||
&& ($flags !~ /^-LIST:/)
|
||||
&& ($flags !~ /^-LANG:/)
|
||||
) {
|
||||
$linker_L .= $flags . " "
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-Y/) {
|
||||
$linker_L .= "-Wl,". $flags . " "
|
||||
}
|
||||
|
||||
if ($flags =~ /^\--exclude-libs/) {
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
$flags="";
|
||||
}
|
||||
|
||||
if (
|
||||
($flags =~ /^\-l/)
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
&& ($flags !~ /numa/)
|
||||
&& ($flags !~ /crt[0-9]/)
|
||||
&& ($flags !~ /gcc/)
|
||||
&& ($flags !~ /user32/)
|
||||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
&& ($flags !~ /[0-9]+/)
|
||||
) {
|
||||
$linker_l .= $flags . " "
|
||||
}
|
||||
|
||||
$linker_a .= $flags . " " if $flags =~ /\.a$/;
|
||||
}
|
||||
|
||||
data="$($compiler_name $flags -E ctest.c)"
|
||||
} || {
|
||||
printf '%s\n' "C Compiler ($compiler_name) is something wrong." >&2
|
||||
exit 1
|
||||
}
|
||||
|
||||
open(MAKEFILE, "> $makefile") || die "Can't create $makefile";
|
||||
open(CONFFILE, "> $config" ) || die "Can't create $config";
|
||||
have_msa=0  # default: MSA not available; only probed on mips/mips64
|
||||
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
||||
tmpd="$(mktemp -d)"  # scratch directory holding the probe source and its output
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"addvi.b $w0, $w1, 1"'  # asm string pasted into the probe; single quotes keep the $ names literal
|
||||
msa_flags='-mmsa -mfp64 -mload-store-pairs'  # flags the probe is compiled with
|
||||
printf "#include <msa.h>\n\n" >> "$tmpf"
|
||||
printf "void main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
|
||||
args="$msa_flags -o $tmpf.o $tmpf"
|
||||
have_msa=1  # assume success; reset below if the compiler invocation fails
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
have_msa=0
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"  # removes probe source and output together
|
||||
fi
|
||||
|
||||
case "$data" in
|
||||
*ARCH_X86_64*) architecture=x86_64 ;;
|
||||
*ARCH_X86*) architecture=x86 ;;
|
||||
*ARCH_E2K*) architecture=e2k ;;
|
||||
*ARCH_POWER*) architecture=power ;;
|
||||
*ARCH_MIPS64*) architecture=mips64 ;;
|
||||
*ARCH_MIPS*) architecture=mips ;;
|
||||
*ARCH_ALPHA*) architecture=alpha ;;
|
||||
*ARCH_SPARC*) architecture=sparc ;;
|
||||
*ARCH_IA64*) architecture=ia64 ;;
|
||||
*ARCH_ARM64*) architecture=arm64 ;;
|
||||
*ARCH_ARM*) architecture=arm ;;
|
||||
*ARCH_ZARCH*) architecture=zarch ;;
|
||||
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
|
||||
esac
|
||||
|
||||
binformat='bin32'
|
||||
case "$data" in
|
||||
*BINARY_64*) binformat='bin64' ;;
|
||||
esac
|
||||
|
||||
no_avx512=0  # 0 = compiler accepted the AVX512 probe (or no probe was run); 1 = probe failed
|
||||
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then  # probe only on x86 targets
|
||||
tmpd=`mktemp -d`  # scratch directory holding the probe source and its object file
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'  # asm string using a zmm register, pasted verbatim into the probe
|
||||
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
if [ "$compiler" = "PGI" ]; then  # PGI gets its own target-selection option
|
||||
args=" -tp skylake -c -o $tmpf.o $tmpf"
|
||||
else
|
||||
args=" -march=skylake-avx512 -c -o $tmpf.o $tmpf"
|
||||
fi
|
||||
no_avx512=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1  # compile only (-c); diagnostics are discarded
|
||||
} || {
|
||||
no_avx512=1  # compilation failed: report NO_AVX512
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
no_rv64gv=0  # 0 = compiler accepted the rv64gv probe (or no probe was run); 1 = probe failed
|
||||
if [ "$architecture" = "riscv64" ]; then  # probe only on riscv64 targets
|
||||
tmpd=`mktemp -d`  # scratch directory holding the probe source and its object file
|
||||
tmpf="$tmpd/a.c"
|
||||
code='"vsetvli zero, zero, e8, m1\n"'  # asm string pasted verbatim; the \n stays a C escape because printf %s does not interpret it
|
||||
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
|
||||
args=" -march=rv64gv -c -o $tmpf.o $tmpf"
|
||||
no_rv64gv=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1  # compile only (-c); diagnostics are discarded
|
||||
} || {
|
||||
no_rv64gv=1  # compilation failed: report NO_RV64GV
|
||||
}
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
c11_atomics=0  # stays 0 unless the preprocessed output reports HAVE_C11 and <stdatomic.h> compiles
|
||||
case "$data" in
|
||||
*HAVE_C11*)  # marker found in the preprocessed ctest.c output
|
||||
tmpd=`mktemp -d`  # scratch directory holding the probe source and its object file
|
||||
tmpf="$tmpd/a.c"
|
||||
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
|
||||
args=" -c -o $tmpf.o $tmpf"
|
||||
c11_atomics=1  # assume success; reset below if the compile fails
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
c11_atomics=0
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
;;
|
||||
esac
|
||||
|
||||
oldgcc=0  # set to 1 below when GCC on x86/x86_64 reports a version <= 4.6
|
||||
no_avx2=0  # set together with oldgcc
|
||||
if [ "$compiler" = "GCC" ]; then  # check applies only when the compiler was classified as GCC
|
||||
case "$architecture" in x86|x86_64)  # ...and only for x86 / x86_64 targets
|
||||
no_avx2=0
|
||||
oldgcc=0
|
||||
data=`$compiler_name -dumpversion`  # reuses $data for the compiler's version string
|
||||
case "$data" in *.*.*)  # three-component version (X.Y.Z)
|
||||
data="${data%.*}"  # drop the last component, leaving X.Y
|
||||
esac
|
||||
if awk -v n1=$data -v n2=4.6 'BEGIN { exit !(n1 <= n2) }'; then  # numeric compare; NOTE(review): X.Y is compared as a decimal, so 4.10 would compare as 4.1 - confirm acceptable
|
||||
no_avx2=1
|
||||
oldgcc=1
|
||||
fi
|
||||
esac
|
||||
fi
|
||||
|
||||
data=`$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`
|
||||
|
||||
need_fu=''  # leading symbol-prefix characters (_ or .) found after "globl"; stays empty if there are none
|
||||
if echo "$data" | grep 'globl[[:space:]][_\.]' >/dev/null; then  # test only: discard the match so it does not leak onto stdout
|
||||
need_fu="${data##*globl[[:space:]]}"  # keep what follows the last "globl<space>"
|
||||
need_fu="${need_fu%%[!_\.]*}"  # cut at the first character that is not _ or .
|
||||
fi
|
||||
|
||||
cross=0  # becomes 1 when the target architecture or OS differs from the build host
|
||||
|
||||
if [ "$architecture" != "$hostarch" ]; then
|
||||
cross=1
|
||||
[ "$hostarch" = "x86_64" ] && [ "$architecture" = "x86" ] && cross=0  # x86 target on an x86_64 host is not treated as cross
|
||||
[ "$hostarch" = "mips64" ] && [ "$architecture" = "mips" ] && cross=0  # likewise mips on a mips64 host
|
||||
fi
|
||||
|
||||
[ "$os" != "$hostos" ] && cross=1  # a differing target OS counts as cross regardless of architecture
|
||||
[ "$os" = "Android" ] && [ "$hostos" = "Linux" ] && [ -n "$TERMUX_APP_PID" ] \
|
||||
&& cross=0  # exception: Android target on a Linux host with TERMUX_APP_PID set
|
||||
|
||||
[ "$USE_OPENMP" != 1 ] && openmp=''
|
||||
|
||||
linker_L=""
|
||||
linker_l=""
|
||||
linker_a=""
|
||||
|
||||
link=`$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`
|
||||
|
||||
link=`echo "$link" | sed 's/\-Y[[:space:]]P\,/\-Y/g'`
|
||||
|
||||
|
||||
flags=`echo $link | tr "'[[:space:]],\n" " "`
|
||||
|
||||
# Replace any double quotes in each flag with spaces
|
||||
old_flags="$flags"
|
||||
flags=''
|
||||
|
||||
|
||||
for flag in $old_flags; do
|
||||
f=`echo "$flag" | tr '"' ' '`
|
||||
flags="$flags $f"
|
||||
done
|
||||
|
||||
for flag in $flags; do
|
||||
case "$flag" in -L*)
|
||||
case "$flag" in
|
||||
-LIST:*|-LANG:*) ;;
|
||||
*) linker_L="$linker_L $flag" ;;
|
||||
esac
|
||||
esac
|
||||
|
||||
case "$flag" in -Y*)
|
||||
linker_L="$linker_L -Wl,$flag" ;;
|
||||
esac
|
||||
|
||||
case "$flag" in --exclude-libs*)  # forward this linker option through the compiler driver via -Wl,
|
||||
linker_L="$linker_L -Wl,$flag"
|
||||
flag=""  # blank the current token (the loop variable) so the -l* and *.a checks further down skip it
|
||||
;;
|
||||
esac
|
||||
|
||||
case "$flag" in -l*)
|
||||
case "$flag" in
|
||||
*gfortranbegin*|*frtbegin*|*pathfstart*|*numa*|*crt[0-9]*|\
|
||||
*gcc*|*user32*|*kernel32*|*advapi32*|*shell32*|*omp*|\
|
||||
*[0-9]*) ;;
|
||||
*) linker_l="$linker_l $flag" ;;
|
||||
esac
|
||||
esac
|
||||
|
||||
case "$flag" in *.a) linker_a="$linker_a $flag" ;; esac
|
||||
done
|
||||
|
||||
[ "$makefile" = "-" ] && {  # "-" as the makefile argument: print the capability results to stdout instead of writing files
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
exit 0  # stop here; nothing below this group runs in "-" mode
|
||||
}
|
||||
|
||||
:> "$makefile" || exit 1
|
||||
:> "$config" || exit 1
|
||||
|
||||
|
||||
# print $data, "\n";
|
||||
|
||||
print MAKEFILE "OSNAME=$os\n";
|
||||
print MAKEFILE "ARCH=$architecture\n";
|
||||
print MAKEFILE "C_COMPILER=$compiler\n";
|
||||
print MAKEFILE "BINARY32=\n" if $binformat ne bin32;
|
||||
print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
|
||||
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
|
||||
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
|
||||
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
|
||||
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
|
||||
print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
||||
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
||||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
|
||||
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
|
||||
{
|
||||
printf "OSNAME=%s\n" "$os"
|
||||
printf "ARCH=%s\n" "$architecture"
|
||||
printf "C_COMPILER=%s\n" "$compiler"
|
||||
# Quote the expansion so an empty or whitespace-containing value cannot
# turn the test into a syntax error (matches the quoted tests below).
[ "$binformat" != 'bin32' ] && printf "BINARY32=\n"
|
||||
# Quote the expansion so an empty or whitespace-containing value cannot
# turn the test into a syntax error (matches the quoted tests below).
[ "$binformat" != 'bin64' ] && printf "BINARY64=\n"
|
||||
[ "$binformat" = "bin32" ] && printf "BINARY32=1\n"
|
||||
[ "$binformat" = "bin64" ] && printf "BINARY64=1\n"
|
||||
[ -n "$need_fu" ] && printf 'FU=%s\n' "$need_fu"
|
||||
[ "$cross" -ne 0 ] && [ -n "$cross_suffix" ] && \
|
||||
printf "CROSS_SUFFIX=%s\n" "$cross_suffix"
|
||||
[ "$cross" -ne 0 ] && printf "CROSS=1\n"
|
||||
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a"
|
||||
[ "$have_msa" -eq 1 ] && {
|
||||
printf "HAVE_MSA=1\n"
|
||||
printf "MSA_FLAGS=%s\n" "$msa_flags"
|
||||
}
|
||||
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
|
||||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
} >> "$makefile"
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
$compiler =~ tr/[a-z]/[A-Z]/;
|
||||
# Upper-case the OS name for the "#define OS_..." line written to the config header.
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]' `
|
||||
architecture=`echo "$architecture" | tr '[[:lower:]]' '[[:upper:]]' `
|
||||
compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
|
||||
|
||||
print CONFFILE "#define OS_$os\t1\n";
|
||||
print CONFFILE "#define ARCH_$architecture\t1\n";
|
||||
print CONFFILE "#define C_$compiler\t1\n";
|
||||
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
|
||||
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
||||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
||||
{
|
||||
printf "#define OS_%s\t1\n" "$os"
|
||||
printf "#define ARCH_%s\t1\n" "$architecture"
|
||||
printf "#define C_%s\t1\n" "$compiler"
|
||||
[ "$binformat" = "bin32" ] && printf "#define __32BIT__\t1\n"
|
||||
[ "$binformat" = "bin64" ] && printf "#define __64BIT__\t1\n"
|
||||
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
|
||||
[ "$have_msa" -eq 1 ] && printf "#define HAVE_MSA\t1\n"
|
||||
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
|
||||
} >> "$config"
|
||||
|
||||
|
||||
if ($os eq "LINUX") {
|
||||
if [ "$os" = "LINUX" ]; then
|
||||
|
||||
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
|
||||
|
||||
# if ($pthread[2] ne "") {
|
||||
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
|
||||
# } else {
|
||||
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
||||
printf "#define PTHREAD_CREATE_FUNC pthread_create\n" >> "$config"
|
||||
# }
|
||||
} else {
|
||||
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
||||
}
|
||||
|
||||
close(MAKEFILE);
|
||||
close(CONFFILE);
|
||||
else
|
||||
printf "#define PTHREAD_CREATE_FUNC pthread_create\n" >> "$config"
|
||||
fi
|
||||
|
|
|
|||
|
|
@ -0,0 +1,456 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;
|
||||
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
|
||||
chop($hostarch);
|
||||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/);
|
||||
$hostarch = "arm64" if ($hostarch eq "aarch64");
|
||||
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
|
||||
$hostarch = "zarch" if ($hostarch eq "s390x");
|
||||
|
||||
#$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
$config = shift(@ARGV);
|
||||
|
||||
$compiler_name = shift(@ARGV);
|
||||
$flags = join(" ", @ARGV);
|
||||
|
||||
# First, we need to know the target OS and compiler name
|
||||
|
||||
$data = `$compiler_name $flags -E ctest.c`;
|
||||
|
||||
if ($?) {
|
||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||
die 1;
|
||||
}
|
||||
|
||||
$cross_suffix = "";
|
||||
|
||||
eval "use File::Basename";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Basename, emulating its functionality";
|
||||
# Emulate dirname(): everything before the last "/", or "." when the
# compiler name carries no directory component at all.
my $slashpos = rindex($compiler_name, "/");
my $dirnam = ($slashpos < 0) ? "." : substr($compiler_name, 0, $slashpos);
|
||||
if ($dirnam ne ".") {
|
||||
$cross_suffix .= $dirnam . "/";
|
||||
}
|
||||
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
|
||||
if ($basnam =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
} else {
|
||||
if (dirname($compiler_name) ne ".") {
|
||||
$cross_suffix .= dirname($compiler_name) . "/";
|
||||
}
|
||||
|
||||
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
|
||||
$cross_suffix .= $1;
|
||||
}
|
||||
}
|
||||
|
||||
$compiler = "";
|
||||
$compiler = LSB if ($data =~ /COMPILER_LSB/);
|
||||
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
|
||||
$compiler = PGI if ($data =~ /COMPILER_PGI/);
|
||||
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
|
||||
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
|
||||
$compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/);
|
||||
$compiler = SUN if ($data =~ /COMPILER_SUN/);
|
||||
$compiler = IBM if ($data =~ /COMPILER_IBM/);
|
||||
$compiler = DEC if ($data =~ /COMPILER_DEC/);
|
||||
$compiler = FUJITSU if ($data =~ /COMPILER_FUJITSU/);
|
||||
$compiler = GCC if ($compiler eq "");
|
||||
|
||||
$os = Linux if ($data =~ /OS_LINUX/);
|
||||
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
||||
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
||||
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
|
||||
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
|
||||
$os = Darwin if ($data =~ /OS_DARWIN/);
|
||||
$os = SunOS if ($data =~ /OS_SUNOS/);
|
||||
$os = AIX if ($data =~ /OS_AIX/);
|
||||
$os = osf if ($data =~ /OS_OSF/);
|
||||
$os = WINNT if ($data =~ /OS_WINNT/);
|
||||
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
|
||||
$os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
if ($os eq "AIX") {
|
||||
$compiler_name .= " -maix32" if ($binary eq "32");
|
||||
$compiler_name .= " -maix64" if ($binary eq "64");
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "mips") {
|
||||
$compiler_name .= " -mabi=32";
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "mips64") {
|
||||
$compiler_name .= " -mabi=n32" if ($binary eq "32");
|
||||
$compiler_name .= " -mabi=64" if ($binary eq "64");
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if (($architecture eq "arm") || ($architecture eq "arm64")) {
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($architecture eq "zarch") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "e2k") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "alpha") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "ia64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) {
|
||||
$defined = 1;
|
||||
$binary =32;
|
||||
}
|
||||
|
||||
if ($architecture eq "riscv64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "loongarch64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
$openmp = "-mp";
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($compiler eq "IBM") {
|
||||
$compiler_name .= " -q32" if ($binary eq "32");
|
||||
$compiler_name .= " -q64" if ($binary eq "64");
|
||||
$openmp = "-qsmp=omp";
|
||||
$defined = 1;
|
||||
}
|
||||
|
||||
if ($compiler eq "INTEL") {
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler eq "PATHSCALE") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler eq "OPEN64") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler eq "CLANG") {
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler eq "GCC" || $compiler eq "LSB") {
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler eq "FUJITSU") {
|
||||
$openmp = "-Kopenmp";
|
||||
}
|
||||
|
||||
if ($defined == 0) {
|
||||
$compiler_name .= " -m32" if ($binary eq "32");
|
||||
$compiler_name .= " -m64" if ($binary eq "64");
|
||||
}
|
||||
|
||||
# Do again
|
||||
|
||||
$data = `$compiler_name $flags -E ctest.c`;
|
||||
|
||||
if ($?) {
|
||||
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
|
||||
die 1;
|
||||
}
|
||||
|
||||
$have_msa = 0;
|
||||
if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$have_msa = 0;
|
||||
} else {
|
||||
$have_msa = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
$no_avx512= 0;
|
||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
|
||||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
|
||||
if ($compiler eq "PGI") {
|
||||
$args = " -tp skylake -c -o $tmpf.o $tmpf";
|
||||
}
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_avx512 = 1;
|
||||
} else {
|
||||
$no_avx512 = 0;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
$no_rv64gv= 0;
|
||||
if (($architecture eq "riscv64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with the RISCV vector extension";
|
||||
$no_rv64gv = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"vsetvli zero, zero, e8, m1\n"';
|
||||
print $fh "int main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=rv64gv -c -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_rv64gv = 1;
|
||||
} else {
|
||||
$no_rv64gv = 0;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
$c11_atomics = 0;
|
||||
if ($data =~ /HAVE_C11/) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11";
|
||||
$c11_atomics = 0;
|
||||
} else {
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
print $fh "#include <stdatomic.h>\nint main(void){}\n";
|
||||
$args = " -c -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$c11_atomics = 0;
|
||||
} else {
|
||||
$c11_atomics = 1;
|
||||
}
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) {
|
||||
$no_avx2 = 0;
|
||||
$oldgcc = 0;
|
||||
$data = `$compiler_name -dumpversion`;
|
||||
# $data holds the raw `-dumpversion` output; the comparison below is numeric.
# NOTE(review): Perl numifies only the leading "major.minor" prefix of a
# string such as "4.6.3", so patch levels are ignored here - confirm intended.
if ($data <= 4.6) {
|
||||
$no_avx2 = 1;
|
||||
$oldgcc = 1;
|
||||
}
|
||||
}
|
||||
|
||||
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
||||
# $data is the first "globl" line grepped from the assembly generated for
# ctest1.c. Capture group 1 collects any leading underscores/dots in front
# of the symbol name; it is stored in $need_fu just below.
$data =~ /globl\s([_\.]*)(.*)/;
|
||||
|
||||
$need_fu = $1;
|
||||
|
||||
$cross = 0;
|
||||
|
||||
if ($architecture ne $hostarch) {
|
||||
$cross = 1;
|
||||
$cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86"));
|
||||
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
|
||||
}
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
# Building inside Termux on an Android device is a native build, not a cross
# build. Test for a non-empty TERMUX_APP_PID with string operators; the
# previous numeric "!=" against "" only worked by accident and warns under -w.
$cross = 0 if (($os eq "Android") && ($hostos eq "Linux") && defined($ENV{TERMUX_APP_PID}) && ($ENV{TERMUX_APP_PID} ne ""));
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
$linker_l = "";
|
||||
$linker_a = "";
|
||||
|
||||
{
|
||||
$link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
@flags = split(/[\s\,\n]/, $link);
|
||||
# remove leading and trailing quotes from each flag.
|
||||
@flags = map {s/^['"]|['"]$//g; $_} @flags;
|
||||
|
||||
foreach $flags (@flags) {
|
||||
if (
|
||||
($flags =~ /^\-L/)
|
||||
&& ($flags !~ /^-LIST:/)
|
||||
&& ($flags !~ /^-LANG:/)
|
||||
) {
|
||||
$linker_L .= $flags . " "
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-Y/) {
|
||||
$linker_L .= "-Wl,". $flags . " "
|
||||
}
|
||||
|
||||
if ($flags =~ /^\--exclude-libs/) {
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
$flags="";
|
||||
}
|
||||
|
||||
if (
|
||||
($flags =~ /^\-l/)
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
&& ($flags !~ /numa/)
|
||||
&& ($flags !~ /crt[0-9]/)
|
||||
&& ($flags !~ /gcc/)
|
||||
&& ($flags !~ /user32/)
|
||||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
&& ($flags !~ /[0-9]+/)
|
||||
) {
|
||||
$linker_l .= $flags . " "
|
||||
}
|
||||
|
||||
$linker_a .= $flags . " " if $flags =~ /\.a$/;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
open(MAKEFILE, "> $makefile") || die "Can't create $makefile";
|
||||
open(CONFFILE, "> $config" ) || die "Can't create $config";
|
||||
|
||||
# print $data, "\n";
|
||||
|
||||
print MAKEFILE "OSNAME=$os\n";
|
||||
print MAKEFILE "ARCH=$architecture\n";
|
||||
print MAKEFILE "C_COMPILER=$compiler\n";
|
||||
print MAKEFILE "BINARY32=\n" if $binformat ne bin32;
|
||||
print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
|
||||
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
|
||||
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
|
||||
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
|
||||
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
|
||||
print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
||||
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
||||
print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
|
||||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
|
||||
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
$compiler =~ tr/[a-z]/[A-Z]/;
|
||||
|
||||
print CONFFILE "#define OS_$os\t1\n";
|
||||
print CONFFILE "#define ARCH_$architecture\t1\n";
|
||||
print CONFFILE "#define C_$compiler\t1\n";
|
||||
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
|
||||
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
||||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
||||
|
||||
|
||||
if ($os eq "LINUX") {
|
||||
|
||||
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
|
||||
|
||||
# if ($pthread[2] ne "") {
|
||||
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
|
||||
# } else {
|
||||
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
||||
# }
|
||||
} else {
|
||||
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
|
||||
}
|
||||
|
||||
close(MAKEFILE);
|
||||
close(CONFFILE);
|
||||
2
cblas.h
2
cblas.h
|
|
@ -28,6 +28,8 @@ char* openblas_get_corename(void);
|
|||
#ifdef OPENBLAS_OS_LINUX
|
||||
/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
|
||||
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
|
||||
/* Queries thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
|
||||
int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
|
||||
#endif
|
||||
|
||||
/* Get the parallelization type which is used by OpenBLAS */
|
||||
|
|
|
|||
|
|
@ -161,6 +161,30 @@ if (${CORE} STREQUAL ARMV8SVE)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL CORTEXA510)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL CORTEXA710)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL CORTEXX1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL CORTEXX2)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL POWER10)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
|
|
|
|||
|
|
@ -50,6 +50,15 @@ else()
|
|||
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED USE_PERL)
|
||||
add_custom_command(
|
||||
OUTPUT ${PROJECT_BINARY_DIR}/openblas.def
|
||||
#TARGET ${OpenBLAS_LIBNAME} PRE_LINK
|
||||
COMMAND "${PROJECT_SOURCE_DIR}/exports/gensymbol"
|
||||
ARGS "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
|
||||
COMMENT "Create openblas.def file"
|
||||
VERBATIM)
|
||||
# USE_PERL was defined: fall back to running gensymbol through the Perl rule below.
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${PROJECT_BINARY_DIR}/openblas.def
|
||||
#TARGET ${OpenBLAS_LIBNAME} PRE_LINK
|
||||
|
|
@ -57,5 +66,5 @@ add_custom_command(
|
|||
ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
|
||||
COMMENT "Create openblas.def file"
|
||||
VERBATIM)
|
||||
|
||||
endif()
|
||||
endif()
|
||||
|
|
@ -25,11 +25,19 @@ check_language(Fortran)
|
|||
if(CMAKE_Fortran_COMPILER)
|
||||
enable_language(Fortran)
|
||||
else()
|
||||
if (NOT NO_LAPACK)
|
||||
message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK")
|
||||
endif()
|
||||
set (NOFORTRAN 1)
|
||||
set (NO_LAPACK 1)
|
||||
if (NOT NO_LAPACK)
|
||||
if (NOT XXXXX)
|
||||
message(STATUS "No Fortran compiler found, can build only BLAS and f2c-converted LAPACK")
|
||||
set(C_LAPACK 1)
|
||||
if (INTERFACE64)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DLAPACK_ILP64")
|
||||
endif ()
|
||||
set(TIMER "NONE")
|
||||
else ()
|
||||
message(STATUS "No Fortran compiler found, can build only BLAS")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT ONLY_CBLAS)
|
||||
|
|
|
|||
|
|
@ -67,7 +67,15 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
|
|||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
|
||||
if (WIN32)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} /integer-size:64")
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -integer-size 64")
|
||||
endif ()
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
endif ()
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
|
||||
|
|
@ -214,6 +222,17 @@ if (${F_COMPILER} STREQUAL "COMPAQ")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "CRAY")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL")
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -hnopattern")
|
||||
if (INTERFACE64)
|
||||
set (FCOMMON_OPT "${FCOMMON_OPT} -s integer64")
|
||||
endif ()
|
||||
if (NOT USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -O noomp")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# from the root Makefile - this is for lapack-netlib to compile the correct secnd file.
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
set(TIMER "INT_ETIME")
|
||||
|
|
|
|||
|
|
@ -1,12 +1,14 @@
|
|||
# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files.
|
||||
|
||||
if (NOT C_LAPACK)
|
||||
message (STATUS "fortran lapack")
|
||||
set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F
|
||||
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f
|
||||
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90
|
||||
../INSTALL/ilaver.f xerbla_array.f
|
||||
../INSTALL/slamch.f)
|
||||
|
||||
set(SCLAUX
|
||||
scombssq.f sbdsvdx.f sstevx.f sstein.f
|
||||
la_constants.f90
|
||||
sbdsdc.f
|
||||
sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f
|
||||
slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f
|
||||
|
|
@ -15,16 +17,17 @@ set(SCLAUX
|
|||
slapy2.f slapy3.f slarnv.f
|
||||
slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f
|
||||
slarrk.f slarrr.f slaneg.f
|
||||
slartg.f slaruv.f slas2.f slascl.f
|
||||
slartg.f90 slaruv.f slas2.f slascl.f
|
||||
slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f
|
||||
slasd7.f slasd8.f slasda.f slasdq.f slasdt.f
|
||||
slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f
|
||||
slasr.f slasrt.f slassq.f slasv2.f spttrf.f sstebz.f sstedc.f
|
||||
slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
|
||||
ssteqr.f ssterf.f slaisnan.f sisnan.f
|
||||
slartgp.f slartgs.f
|
||||
slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f
|
||||
../INSTALL/second_${TIMER}.f)
|
||||
|
||||
set(DZLAUX
|
||||
la_constants.f90
|
||||
dbdsdc.f
|
||||
dbdsvdx.f dstevx.f dstein.f
|
||||
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
|
||||
|
|
@ -34,13 +37,13 @@ set(DZLAUX
|
|||
dlapy2.f dlapy3.f dlarnv.f
|
||||
dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f
|
||||
dlarrk.f dlarrr.f dlaneg.f
|
||||
dlartg.f dlaruv.f dlas2.f dlascl.f
|
||||
dlartg.f90 dlaruv.f dlas2.f dlascl.f
|
||||
dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f
|
||||
dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
|
||||
dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
|
||||
dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f
|
||||
dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
|
||||
dsteqr.f dsterf.f dlaisnan.f disnan.f
|
||||
dlartgp.f dlartgs.f
|
||||
dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f
|
||||
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f)
|
||||
|
||||
set(SLASRC
|
||||
|
|
@ -58,6 +61,7 @@ set(SLASRC
|
|||
sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f
|
||||
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
|
||||
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
|
||||
slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f
|
||||
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
|
||||
slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
|
||||
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
|
||||
|
|
@ -170,10 +174,11 @@ set(CLASRC
|
|||
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
|
||||
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
|
||||
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
|
||||
claqz0.f claqz1.f claqz2.f claqz3.f
|
||||
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
|
||||
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
|
||||
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
|
||||
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
|
||||
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
|
||||
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90
|
||||
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
|
||||
clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
|
||||
cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
|
||||
|
|
@ -244,6 +249,7 @@ set(DLASRC
|
|||
dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f
|
||||
dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f
|
||||
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
|
||||
dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f
|
||||
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
|
||||
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
|
||||
dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
|
||||
|
|
@ -345,6 +351,7 @@ set(ZLASRC
|
|||
zhetrs_3.f zhecon_3.f zhesv_rk.f
|
||||
zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f
|
||||
zhgeqz.f zhpcon.f zhpev.f zhpevd.f
|
||||
zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f
|
||||
zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f
|
||||
zhpsvx.f
|
||||
zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f
|
||||
|
|
@ -362,9 +369,9 @@ set(ZLASRC
|
|||
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
|
||||
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
|
||||
zlarfg.f zlarfgp.f zlarft.f
|
||||
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
zlassq.f zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f
|
||||
zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f
|
||||
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f
|
||||
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
|
||||
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
|
||||
|
|
@ -488,6 +495,499 @@ if(BUILD_COMPLEX16)
|
|||
message(STATUS "Building Double Precision Complex")
|
||||
endif()
|
||||
|
||||
else ()
|
||||
|
||||
message (STATUS "c lapack")
|
||||
set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c
|
||||
ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c
|
||||
../INSTALL/ilaver.c xerbla_array.c
|
||||
../INSTALL/slamch.c)
|
||||
|
||||
set(SCLAUX
|
||||
scombssq.c sbdsvdx.c sstevx.c sstein.c
|
||||
sbdsdc.c
|
||||
sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c
|
||||
slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c
|
||||
slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c
|
||||
slagts.c slamrg.c slanst.c
|
||||
slapy2.c slapy3.c slarnv.c
|
||||
slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c
|
||||
slarrk.c slarrr.c slaneg.c
|
||||
slartg.c slaruv.c slas2.c slascl.c
|
||||
slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c
|
||||
slasd7.c slasd8.c slasda.c slasdq.c slasdt.c
|
||||
slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c
|
||||
slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c
|
||||
ssteqr.c ssterf.c slaisnan.c sisnan.c
|
||||
slartgp.c slartgs.c
|
||||
../INSTALL/second_${TIMER}.c)
|
||||
|
||||
set(DZLAUX
|
||||
dbdsdc.c
|
||||
dbdsvdx.c dstevx.c dstein.c
|
||||
dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c
|
||||
dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c
|
||||
dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c
|
||||
dlagts.c dlamrg.c dlanst.c
|
||||
dlapy2.c dlapy3.c dlarnv.c
|
||||
dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c
|
||||
dlarrk.c dlarrr.c dlaneg.c
|
||||
dlartg.c dlaruv.c dlas2.c dlascl.c
|
||||
dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c
|
||||
dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c
|
||||
dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c
|
||||
dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c
|
||||
dsteqr.c dsterf.c dlaisnan.c disnan.c
|
||||
dlartgp.c dlartgs.c
|
||||
../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c)
|
||||
|
||||
set(SLASRC
|
||||
sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c
|
||||
sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c
|
||||
sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c
|
||||
sgehd2.c sgehrd.c sgelq2.c sgelqf.c
|
||||
sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
|
||||
sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
|
||||
sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
|
||||
sgetrf2.c sgetri.c
|
||||
sggbak.c sggbal.c
|
||||
sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c
|
||||
sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c
|
||||
sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c
|
||||
sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c
|
||||
shsein.c shseqr.c slabrd.c slacon.c slacn2.c
|
||||
slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c
|
||||
slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c
|
||||
slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c
|
||||
slansy.c slantb.c slantp.c slantr.c slanv2.c
|
||||
slapll.c slapmt.c
|
||||
slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
|
||||
slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
|
||||
slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
|
||||
slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
|
||||
slarrv.c slartv.c
|
||||
slarz.c slarzb.c slarzt.c slasy2.c
|
||||
slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c
|
||||
slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c
|
||||
sopgtr.c sopmtr.c sorg2l.c sorg2r.c
|
||||
sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c
|
||||
sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c
|
||||
sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c
|
||||
sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c
|
||||
spbstf.c spbsv.c spbsvx.c
|
||||
spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c
|
||||
sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c
|
||||
sppcon.c sppequ.c
|
||||
spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c
|
||||
spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c
|
||||
ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c
|
||||
ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c
|
||||
sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c
|
||||
ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c
|
||||
ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c
|
||||
ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c
|
||||
ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c
|
||||
ssyswapr.c ssytrs.c ssytrs2.c
|
||||
ssyconv.c ssyconvf.c ssyconvf_rook.c
|
||||
ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c
|
||||
ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c
|
||||
ssytri_rook.c ssycon_rook.c ssysv_rook.c
|
||||
ssytf2_rk.c ssytrf_rk.c ssytrs_3.c
|
||||
ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c
|
||||
ssysv_aa.c ssytrf_aa.c ssytrs_aa.c
|
||||
stbcon.c
|
||||
stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c
|
||||
stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c
|
||||
stptrs.c
|
||||
strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c
|
||||
strtrs.c stzrzf.c sstemr.c
|
||||
slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c
|
||||
stfttr.c stpttf.c stpttr.c strttf.c strttp.c
|
||||
sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c
|
||||
sgeequb.c ssyequb.c spoequb.c sgbequb.c
|
||||
sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c
|
||||
sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c
|
||||
sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c
|
||||
stpqrt.c stpqrt2.c stpmqrt.c stprfb.c
|
||||
sgelqt.c sgelqt3.c sgemlqt.c
|
||||
sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c
|
||||
sgelq.c slaswlq.c slamswlq.c sgemlq.c
|
||||
stplqt.c stplqt2.c stpmlqt.c
|
||||
ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c
|
||||
ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c
|
||||
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
|
||||
sgesvdq.c slaorhr_col_getrfnp.c
|
||||
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c )
|
||||
|
||||
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
|
||||
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
|
||||
sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c
|
||||
sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c
|
||||
sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c
|
||||
sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c
|
||||
slascl2.c sla_wwaddw.c)
|
||||
|
||||
set(CLASRC
|
||||
cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c
|
||||
cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
|
||||
cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c
|
||||
cgehd2.c cgehrd.c cgelq2.c cgelqf.c
|
||||
cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
|
||||
cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
|
||||
cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
|
||||
cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
|
||||
cgesvx.c cgetc2.c cgetrf2.c
|
||||
cgetri.c
|
||||
cggbak.c cggbal.c
|
||||
cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c
|
||||
cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c cggrqf.c
|
||||
cggsvd3.c cggsvp3.c
|
||||
cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c
|
||||
chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c
|
||||
checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c
|
||||
chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c
|
||||
chetf2.c chetrd.c
|
||||
chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c
|
||||
chetrs.c chetrs2.c
|
||||
chetf2_rook.c chetrf_rook.c chetri_rook.c
|
||||
chetrs_rook.c checon_rook.c chesv_rook.c
|
||||
chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c
|
||||
chetrs_3.c checon_3.c chesv_rk.c
|
||||
chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c
|
||||
chgeqz.c chpcon.c chpev.c chpevd.c
|
||||
chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c
|
||||
chpsvx.c
|
||||
chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c
|
||||
clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c
|
||||
claed0.c claed7.c claed8.c
|
||||
claein.c claesy.c claev2.c clags2.c clagtm.c
|
||||
clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c
|
||||
clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c
|
||||
clanhb.c clanhe.c
|
||||
clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c
|
||||
clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c
|
||||
claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
|
||||
claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
|
||||
claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
|
||||
clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
|
||||
clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c
|
||||
clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c
|
||||
clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c
|
||||
clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c
|
||||
cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c
|
||||
cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c
|
||||
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
|
||||
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
|
||||
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
|
||||
crot.c cspcon.c csprfs.c cspsv.c
|
||||
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
|
||||
cstegr.c cstein.c csteqr.c csycon.c
|
||||
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
|
||||
csytri2.c csytri2x.c csyswapr.c
|
||||
csytrs.c csytrs2.c
|
||||
csyconv.c csyconvf.c csyconvf_rook.c
|
||||
csytf2_rook.c csytrf_rook.c csytrs_rook.c
|
||||
csytri_rook.c csycon_rook.c csysv_rook.c
|
||||
csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c
|
||||
csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c
|
||||
ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c
|
||||
ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c
|
||||
ctprfs.c ctptri.c
|
||||
ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c
|
||||
ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c
|
||||
cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c
|
||||
cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c
|
||||
cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c
|
||||
cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c
|
||||
chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c
|
||||
ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c
|
||||
cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c
|
||||
cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c
|
||||
cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c
|
||||
cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c
|
||||
ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c
|
||||
cgelqt.c cgelqt3.c cgemlqt.c
|
||||
cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c
|
||||
cgelq.c claswlq.c clamswlq.c cgemlq.c
|
||||
ctplqt.c ctplqt2.c ctpmlqt.c
|
||||
chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c
|
||||
cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c
|
||||
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
|
||||
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
|
||||
cungtsqr.c cungtsqr_row.c cunhr_col.c )
|
||||
|
||||
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
|
||||
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
|
||||
csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c
|
||||
cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c
|
||||
cposvxx.c cporfsx.c cla_porfsx_extended.c
|
||||
cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c
|
||||
cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c
|
||||
cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c
|
||||
chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c
|
||||
cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c
|
||||
cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c)
|
||||
|
||||
set(DLASRC
|
||||
dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c
|
||||
dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c
|
||||
dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c
|
||||
dgehd2.c dgehrd.c dgelq2.c dgelqf.c
|
||||
dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
|
||||
dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
|
||||
dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
|
||||
dgetrf2.c dgetri.c
|
||||
dggbak.c dggbal.c
|
||||
dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c
|
||||
dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c
|
||||
dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c
|
||||
dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c
|
||||
dhsein.c dhseqr.c dlabrd.c dlacon.c dlacn2.c
|
||||
dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c
|
||||
dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c
|
||||
dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c
|
||||
dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
|
||||
dlapll.c dlapmt.c
|
||||
dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
|
||||
dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
|
||||
dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
|
||||
dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
|
||||
dlargv.c dlarrv.c dlartv.c
|
||||
dlarz.c dlarzb.c dlarzt.c dlasy2.c
|
||||
dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c
|
||||
dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c
|
||||
dopgtr.c dopmtr.c dorg2l.c dorg2r.c
|
||||
dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c
|
||||
dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c
|
||||
dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c
|
||||
dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c
|
||||
dpbstf.c dpbsv.c dpbsvx.c
|
||||
dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c
|
||||
dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c
|
||||
dppcon.c dppequ.c
|
||||
dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c
|
||||
dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c
|
||||
dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c
|
||||
dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c
|
||||
dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c
|
||||
dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c
|
||||
dsycon.c dsyev.c dsyevd.c dsyevr.c
|
||||
dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c
|
||||
dsysv.c dsysvx.c
|
||||
dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c
|
||||
dsytri2.c dsytri2x.c dsyswapr.c
|
||||
dsyconv.c dsyconvf.c dsyconvf_rook.c
|
||||
dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c
|
||||
dsytri_rook.c dsycon_rook.c dsysv_rook.c
|
||||
dsytf2_rk.c dsytrf_rk.c dsytrs_3.c
|
||||
dsytri_3.c dsytri_3x.c dsycon_3.c dsysv_rk.c
|
||||
dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c
|
||||
dtbcon.c
|
||||
dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c
|
||||
dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c
|
||||
dtptrs.c
|
||||
dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c
|
||||
dtrtrs.c dtzrzf.c dstemr.c
|
||||
dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c
|
||||
dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c
|
||||
dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c
|
||||
dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c
|
||||
dgeequb.c dsyequb.c dpoequb.c dgbequb.c
|
||||
dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c
|
||||
dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c
|
||||
dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c
|
||||
dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c
|
||||
dgelqt.c dgelqt3.c dgemlqt.c
|
||||
dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c
|
||||
dgelq.c dlaswlq.c dlamswlq.c dgemlq.c
|
||||
dtplqt.c dtplqt2.c dtpmlqt.c
|
||||
dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c
|
||||
dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c
|
||||
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
|
||||
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
|
||||
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c )
|
||||
|
||||
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
|
||||
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
|
||||
dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c
|
||||
dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c
|
||||
dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c
|
||||
dla_gbamv.c dla_gbrcond.c dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c
|
||||
dlascl2.c dla_wwaddw.c)
|
||||
|
||||
set(ZLASRC
|
||||
zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c
|
||||
zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
|
||||
zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c
|
||||
zgehd2.c zgehrd.c zgelq2.c zgelqf.c
|
||||
zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
|
||||
zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
|
||||
zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
|
||||
zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
|
||||
zgetc2.c zgetrf2.c
|
||||
zgetri.c
|
||||
zggbak.c zggbal.c
|
||||
zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c
|
||||
zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c
|
||||
zggsvd3.c zggsvp3.c
|
||||
zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c
|
||||
zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c
|
||||
zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c
|
||||
zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c
|
||||
zhetf2.c zhetrd.c
|
||||
zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c
|
||||
zhetrs.c zhetrs2.c
|
||||
zhetf2_rook.c zhetrf_rook.c zhetri_rook.c
|
||||
zhetrs_rook.c zhecon_rook.c zhesv_rook.c
|
||||
zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c
|
||||
zhetrs_3.c zhecon_3.c zhesv_rk.c
|
||||
zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c
|
||||
zhgeqz.c zhpcon.c zhpev.c zhpevd.c
|
||||
zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c
|
||||
zhpsvx.c
|
||||
zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c
|
||||
zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c
|
||||
zlaed0.c zlaed7.c zlaed8.c
|
||||
zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c
|
||||
zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c
|
||||
zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c
|
||||
zlangt.c zlanhb.c
|
||||
zlanhe.c
|
||||
zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c
|
||||
zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c
|
||||
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
|
||||
zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
|
||||
zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
|
||||
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
|
||||
zlarfg.c zlarfgp.c zlarft.c
|
||||
zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c
|
||||
zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c
|
||||
zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c
|
||||
zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c
|
||||
zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c
|
||||
zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c
|
||||
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
|
||||
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
|
||||
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
|
||||
zrot.c zspcon.c zsprfs.c zspsv.c
|
||||
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
|
||||
zstegr.c zstein.c zsteqr.c zsycon.c
|
||||
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
|
||||
zsytri2.c zsytri2x.c zsyswapr.c
|
||||
zsytrs.c zsytrs2.c
|
||||
zsyconv.c zsyconvf.c zsyconvf_rook.c
|
||||
zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c
|
||||
zsytri_rook.c zsycon_rook.c zsysv_rook.c
|
||||
zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c
|
||||
zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c
|
||||
ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c
|
||||
ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c
|
||||
ztprfs.c ztptri.c
|
||||
ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c
|
||||
ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c
|
||||
zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c zungql.c zungqr.c zungr2.c
|
||||
zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c
|
||||
zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c
|
||||
zunmtr.c zupgtr.c
|
||||
zupmtr.c izmax1.c dzsum1.c zstemr.c
|
||||
zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c
|
||||
zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c
|
||||
ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c
|
||||
zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c
|
||||
zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c
|
||||
zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c
|
||||
zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c
|
||||
ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c
|
||||
ztplqt.c ztplqt2.c ztpmlqt.c
|
||||
zgelqt.c zgelqt3.c zgemlqt.c
|
||||
zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c
|
||||
zgelq.c zlaswlq.c zlamswlq.c zgemlq.c
|
||||
zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c
|
||||
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
|
||||
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
|
||||
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
|
||||
zungtsqr.c zungtsqr_row.c zunhr_col.c)
|
||||
|
||||
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
|
||||
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
|
||||
zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c
|
||||
zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c
|
||||
zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c
|
||||
zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c
|
||||
zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c
|
||||
zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c
|
||||
zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c)
|
||||
|
||||
|
||||
if(USE_XBLAS)
|
||||
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
|
||||
endif()
|
||||
|
||||
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
|
||||
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
|
||||
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
|
||||
list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c
|
||||
DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c
|
||||
DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c)
|
||||
list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c
|
||||
DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c
|
||||
DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c)
|
||||
list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
|
||||
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
|
||||
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
|
||||
message(STATUS "Building deprecated routines")
|
||||
|
||||
set(DSLASRC spotrs.c)
|
||||
|
||||
set(ZCLASRC cpotrs.c)
|
||||
|
||||
set(SCATGEN slatm1.c slaran.c slarnd.c)
|
||||
|
||||
set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c
|
||||
slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c
|
||||
slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c)
|
||||
|
||||
set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c
|
||||
clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c
|
||||
clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c)
|
||||
|
||||
set(DZATGEN dlatm1.c dlaran.c dlarnd.c)
|
||||
|
||||
set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c
|
||||
dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c
|
||||
dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c)
|
||||
|
||||
set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c
|
||||
zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c
|
||||
zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c)
|
||||
|
||||
if(BUILD_SINGLE)
|
||||
set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX})
|
||||
set(LA_GEN_SRC ${SMATGEN} ${SCATGEN})
|
||||
message(STATUS "Building Single Precision")
|
||||
endif()
|
||||
if(BUILD_DOUBLE)
|
||||
set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX})
|
||||
set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN})
|
||||
message(STATUS "Building Double Precision")
|
||||
endif()
|
||||
if(BUILD_COMPLEX)
|
||||
set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX})
|
||||
SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN})
|
||||
message(STATUS "Building Single Precision Complex")
|
||||
endif()
|
||||
if(BUILD_COMPLEX16)
|
||||
set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX})
|
||||
SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN})
|
||||
# for zlange/zlanhe
|
||||
if (NOT BUILD_DOUBLE)
|
||||
set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c)
|
||||
endif ()
|
||||
message(STATUS "Building Double Precision Complex")
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
||||
# add lapack-netlib folder to the sources
|
||||
set(LA_SOURCES "")
|
||||
foreach (LA_FILE ${LA_REL_SRC})
|
||||
|
|
@ -496,4 +996,9 @@ endforeach ()
|
|||
foreach (LA_FILE ${LA_GEN_SRC})
|
||||
list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}")
|
||||
endforeach ()
|
||||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
|
||||
|
||||
if (NOT C_LAPACK)
|
||||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
|
||||
else ()
|
||||
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
endif ()
|
||||
|
|
|
|||
|
|
@ -131,6 +131,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
|
|
@ -143,6 +145,684 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ATOM")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t24576\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t8192\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 1)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "PRESCOTT")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t16384\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1048576\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t8192\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t8192\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t8192\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "NEHALEM")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t65535\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t32768\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t65536\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t32768\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 2)
|
||||
set(DGEMM_UNROLL_N 8)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 1)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(CGEMM3M_UNROLL_M 4)
|
||||
set(CGEMM3M_UNROLL_N 8)
|
||||
set(ZGEMM3M_UNROLL_M 2)
|
||||
set(ZGEMM3M_UNROLL_N 8)
|
||||
elseif ("${TCORE}" STREQUAL "SANDYBRIDGE")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t24576\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t32768\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t24576\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 1)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(CGEMM3M_UNROLL_M 4)
|
||||
set(CGEMM3M_UNROLL_N 8)
|
||||
set(ZGEMM3M_UNROLL_M 2)
|
||||
set(ZGEMM3M_UNROLL_N 8)
|
||||
elseif ("${TCORE}" STREQUAL "HASWELL")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_AVX2\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t20480\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t32768\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t12288\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_AVX2 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 8)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "SKYLAKEX")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_AVX2\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define HAVE_AVX512VL\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t28672\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t12288\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t12288\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
|
||||
set(HAVE_CMOV 1)
|
||||
set(HAVE_MMX 1)
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_AVX2 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(HAVE_AVX512VL 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "COOPERLAKE")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_AVX2\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define HAVE_AVX512VL\n"
|
||||
"#define HAVE_AVX512BF16\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t20480\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t12288\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t12288\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
|
||||
set(HAVE_CMOV 1)
|
||||
set(HAVE_MMX 1)
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_AVX2 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(HAVE_AVX512VL 1)
|
||||
set(HAVE_AVX512BF16 1)
|
||||
set(SBGEMM_UNROLL_M 16)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "SAPPHIRERAPIDS")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t262144\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_AVX2\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define HAVE_AVX512VL\n"
|
||||
"#define HAVE_AVX512BF16\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t20480\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t12288\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t12288\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
|
||||
set(HAVE_CMOV 1)
|
||||
set(HAVE_MMX 1)
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_AVX2 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(HAVE_AVX512VL 1)
|
||||
set(HAVE_AVX512BF16 1)
|
||||
set(SBGEMM_UNROLL_M 32)
|
||||
set(SBGEMM_UNROLL_N 16)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "OPTERON")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1048576\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t32\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_3DNOW\n"
|
||||
"#define HAVE_3DNOWEX\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t15360\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t15360\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t15360\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t15360\n")
|
||||
set(HAVE_3DNOW 1)
|
||||
set(HAVE_3DNOWEX 1)
|
||||
set(HAVE_MMX 1)
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "BARCELONA")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSE4A\n"
|
||||
"#define HAVE_MISALIGNSSE\n"
|
||||
"#define HAVE_128BITFPU\n"
|
||||
"#define HAVE_FASTMOVU\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t14336\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t14336\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t14336\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t14336\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSE4A 1)
|
||||
set(HAVE_MISALIGNSSE 1)
|
||||
set(HAVE_128BITFPU 1)
|
||||
set(HAVE_FASTMOVU 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "BULLDOZER")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t49152\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1024000\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t32\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSE4A\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_MISALIGNSSE\n"
|
||||
"#define HAVE_128BITFPU\n"
|
||||
"#define HAVE_FASTMOVU\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t5376\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t5376\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t14336\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t14336\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSE4A 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_MISALIGNSSE 1)
|
||||
set(HAVE_128BITFPU 1)
|
||||
set(HAVE_FASTMOVU 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "PILEDRIVER")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t16384\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t2097152\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_SSE4A\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_MISALIGNSSE\n"
|
||||
"#define HAVE_128BITFPU\n"
|
||||
"#define HAVE_FASTMOVU\n"
|
||||
"#define HAVE_CFLUSH\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t6144\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t5376\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t10752\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t10752\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_SSE4A 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(HAVE_MISALIGNSSE 1)
|
||||
set(HAVE_128BITFPU 1)
|
||||
set(HAVE_FASTMOVU 1)
|
||||
set(HAVE_CFLUSH 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "STEAMROLLER")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t16384\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t2097152\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_SSE4A\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_MISALIGNSSE\n"
|
||||
"#define HAVE_128BITFPU\n"
|
||||
"#define HAVE_FASTMOVU\n"
|
||||
"#define HAVE_CFLUSH\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t6144\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t5120\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t10240\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t10240\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_SSE4A 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(HAVE_MISALIGNSSE 1)
|
||||
set(HAVE_128BITFPU 1)
|
||||
set(HAVE_FASTMOVU 1)
|
||||
set(HAVE_CFLUSH 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "EXCAVATOR")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t16384\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t2097152\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_SSE4A\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_MISALIGNSSE\n"
|
||||
"#define HAVE_128BITFPU\n"
|
||||
"#define HAVE_FASTMOVU\n"
|
||||
"#define HAVE_CFLUSH\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t6144\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t5120\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t10240\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t10240\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_SSE4A 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(HAVE_MISALIGNSSE 1)
|
||||
set(HAVE_128BITFPU 1)
|
||||
set(HAVE_FASTMOVU 1)
|
||||
set(HAVE_CFLUSH 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 2)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 2)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ZEN")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSE4_1\n"
|
||||
"#define HAVE_SSE4_2\n"
|
||||
"#define HAVE_SSE4A\n"
|
||||
"#define HAVE_MISALIGNSSE\n"
|
||||
"#define HAVE_128BITFPU\n"
|
||||
"#define HAVE_FASTMOVU\n"
|
||||
"#define HAVE_CFLUSH\n"
|
||||
"#define HAVE_AVX\n"
|
||||
"#define HAVE_AVX2\n"
|
||||
"#define HAVE_FMA3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t20480\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t32768\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t12288\n")
|
||||
set(HAVE_SSE 1)
|
||||
set(HAVE_SSE2 1)
|
||||
set(HAVE_SSE3 1)
|
||||
set(HAVE_SSE4_1 1)
|
||||
set(HAVE_SSE4_2 1)
|
||||
set(HAVE_AVX 1)
|
||||
set(HAVE_AVX2 1)
|
||||
set(HAVE_FMA3 1)
|
||||
set(HAVE_SSE4A 1)
|
||||
set(HAVE_MISALIGNSSE 1)
|
||||
set(HAVE_128BITFPU 1)
|
||||
set(HAVE_FASTMOVU 1)
|
||||
set(HAVE_CFLUSH 1)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 8)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
|
|
@ -199,12 +879,12 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
"#define HAVE_VFP\n"
|
||||
"#define HAVE_NEON\n"
|
||||
"#define ARMV8\n")
|
||||
if ("${TCORE}" STREQUAL "CORTEXA57")
|
||||
if ("${TCORE}" STREQUAL "CORTEXA57")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
else ()
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
endif ()
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
|
|
@ -581,6 +1261,15 @@ endif ()
|
|||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "GENERIC")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
endif()
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
|
|
@ -603,7 +1292,7 @@ endif ()
|
|||
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
|
||||
# Move to where gen_config_h would place it
|
||||
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
|
||||
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
|
||||
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
|
||||
|
||||
else(NOT CMAKE_CROSSCOMPILING)
|
||||
# compile getarch
|
||||
|
|
@ -639,7 +1328,7 @@ else(NOT CMAKE_CROSSCOMPILING)
|
|||
OUTPUT_VARIABLE GETARCH_LOG
|
||||
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
|
||||
)
|
||||
|
||||
|
||||
if (NOT ${GETARCH_RESULT})
|
||||
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
|
||||
endif ()
|
||||
|
|
|
|||
|
|
@ -284,8 +284,15 @@ if (NOT NOFORTRAN)
|
|||
# Fortran Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||
else ()
|
||||
set(NO_LAPACK 1)
|
||||
set(NO_LAPACKE 1)
|
||||
if (NOT XXXX)
|
||||
set(C_LAPACK 1)
|
||||
if (INTERFACE64)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DLAPACK_ILP64")
|
||||
endif ()
|
||||
set(TIMER "NONE")
|
||||
else ()
|
||||
set (NO_LAPACK 1)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (BINARY64)
|
||||
|
|
@ -552,6 +559,14 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
|||
endforeach ()
|
||||
endif ()
|
||||
|
||||
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY")
|
||||
# Flags removed from LAPACK_FFLAGS / LAPACK_FPFLAGS by the loop below.
# Entries must be separated by ";" so foreach() visits each flag on its own
# (a "," would fuse two flags into one element that never matches).
# string(REPLACE) does plain substring replacement, so longer flags are listed
# before their prefixes: "-mavx2" must be stripped before "-mavx", otherwise
# "-mavx2" would be reduced to a stray "2".
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mskylake-avx512;-mavx2;-mavx")
|
||||
foreach (FILTER_FLAG ${FILTER_FLAGS})
|
||||
# Quote the input so an empty LAPACK_FFLAGS still passes one (empty) argument;
# unquoted, it would expand to nothing and string(REPLACE) would abort with
# "requires at least four arguments".
string(REPLACE "${FILTER_FLAG}" "" LAPACK_FFLAGS "${LAPACK_FFLAGS}")
|
||||
# Quote the input so an empty LAPACK_FPFLAGS still passes one (empty) argument;
# unquoted, it would expand to nothing and string(REPLACE) would abort with
# "requires at least four arguments".
string(REPLACE "${FILTER_FLAG}" "" LAPACK_FPFLAGS "${LAPACK_FPFLAGS}")
|
||||
endforeach ()
|
||||
endif ()
|
||||
|
||||
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
|
||||
# lapack-netlib is rife with uninitialized warnings -hpa
|
||||
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized")
|
||||
|
|
|
|||
|
|
@ -31,7 +31,11 @@ endif()
|
|||
|
||||
# Pretty thorough determination of arch. Add more if needed
|
||||
if(CMAKE_CL_64 OR MINGW64)
|
||||
set(X86_64 1)
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
|
||||
set(ARM64 1)
|
||||
else()
|
||||
set(X86_64 1)
|
||||
endif()
|
||||
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
|
|
|
|||
|
|
@ -33,9 +33,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef COMMON_ARM64
|
||||
#define COMMON_ARM64
|
||||
|
||||
#ifdef C_MSVC
|
||||
#include <intrin.h>
|
||||
#define MB __dmb(_ARM64_BARRIER_ISH)
|
||||
#define WMB __dmb(_ARM64_BARRIER_ISHST)
|
||||
#define RMB __dmb(_ARM64_BARRIER_ISHLD)
|
||||
#else
|
||||
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
|
||||
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
|
||||
#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory")
|
||||
#endif
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
|
|
@ -53,6 +60,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
BLASULONG ret;
|
||||
|
||||
do {
|
||||
#ifndef C_MSVC
|
||||
__asm__ __volatile__(
|
||||
"mov x4, #1 \n\t"
|
||||
"sevl \n\t"
|
||||
|
|
@ -70,7 +78,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
|
||||
|
||||
);
|
||||
|
||||
#else
|
||||
while (*address) {YIELDING;}
|
||||
ret=InterlockedExchange64((volatile LONG64 *)(address), 1);
|
||||
#endif
|
||||
|
||||
} while (ret);
|
||||
|
||||
|
|
@ -80,6 +91,14 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
|
||||
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||
static __inline BLASULONG rpcc(void){
|
||||
#ifdef C_MSVC
|
||||
const int64_t pmccntr_el0 = (((3 & 1) << 14) | // op0
|
||||
((3 & 7) << 11) | // op1
|
||||
((9 & 15) << 7) | // crn
|
||||
((13 & 15) << 3) | // crm
|
||||
((0 & 7) << 0)); // op2
|
||||
return _ReadStatusReg(pmccntr_el0);
|
||||
#else
|
||||
BLASULONG ret = 0;
|
||||
blasint shift;
|
||||
|
||||
|
|
@ -87,6 +106,7 @@ static __inline BLASULONG rpcc(void){
|
|||
__asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift));
|
||||
|
||||
return ret << shift;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define RPCC_DEFINED
|
||||
|
|
|
|||
|
|
@ -2610,8 +2610,9 @@
|
|||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|
||||
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)
|
||||
#if !defined(DYNAMIC_ARCH) \
|
||||
&& (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \
|
||||
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K))
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sbgemm_p;
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define SEEK_ADDRESS
|
||||
|
||||
#if defined(C910V)
|
||||
#include <riscv-vector.h>
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -6,12 +6,14 @@
|
|||
#include "../cblas.h"
|
||||
#include "cpp_thread_safety_common.h"
|
||||
|
||||
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
|
||||
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize)
|
||||
{
|
||||
const blasint inc = 1;
|
||||
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
|
||||
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||
|
|
@ -20,20 +22,23 @@ int main(int argc, char* argv[]){
|
|||
if (maxHwThreads < 52)
|
||||
numConcurrentThreads = maxHwThreads;
|
||||
|
||||
if (argc > 4){
|
||||
if (argc > 4)
|
||||
{
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
abort();
|
||||
}
|
||||
if(argc == 4){
|
||||
}
|
||||
if(argc == 4)
|
||||
{
|
||||
std::vector<std::string> cliArgs;
|
||||
for (int i = 1; i < argc; i++){
|
||||
for (int i = 1; i < argc; i++)
|
||||
{
|
||||
cliArgs.push_back(argv[i]);
|
||||
std::cout<<argv[i]<<std::endl;
|
||||
}
|
||||
}
|
||||
randomMatSize = std::stoul(cliArgs.at(0));
|
||||
numConcurrentThreads = std::stoul(cliArgs.at(1));
|
||||
numTestRounds = std::stoul(cliArgs.at(2));
|
||||
}
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
|
||||
|
|
@ -56,15 +61,18 @@ int main(int argc, char* argv[]){
|
|||
|
||||
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
|
||||
std::cout<<"Allocating matrices..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++)
|
||||
{
|
||||
matBlock.at(i).resize(randomMatSize*randomMatSize);
|
||||
}
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Allocating vectors..."<<std::flush;
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*2); i++)
|
||||
{
|
||||
vecBlock.at(i).resize(randomMatSize);
|
||||
}
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
|
||||
//pauser();
|
||||
|
||||
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||
|
|
@ -77,31 +85,35 @@ int main(int argc, char* argv[]){
|
|||
|
||||
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
|
||||
omp_set_num_threads(numConcurrentThreads);
|
||||
for(uint32_t R=0; R<numTestRounds; R++){
|
||||
for(uint32_t R=0; R<numTestRounds; R++)
|
||||
{
|
||||
std::cout<<"DGEMV round #"<<R<<std::endl;
|
||||
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++)
|
||||
{
|
||||
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
|
||||
}
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++)
|
||||
{
|
||||
futureBlock[i].get();
|
||||
}
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++)
|
||||
{
|
||||
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
|
||||
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
|
||||
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -45,6 +45,10 @@ size_t length64=sizeof(value64);
|
|||
#define CPU_NEOVERSEN1 11
|
||||
#define CPU_NEOVERSEV1 16
|
||||
#define CPU_NEOVERSEN2 17
|
||||
#define CPU_CORTEXX1 18
|
||||
#define CPU_CORTEXX2 19
|
||||
#define CPU_CORTEXA510 20
|
||||
#define CPU_CORTEXA710 21
|
||||
// Qualcomm
|
||||
#define CPU_FALKOR 6
|
||||
// Cavium
|
||||
|
|
@ -59,6 +63,8 @@ size_t length64=sizeof(value64);
|
|||
#define CPU_VORTEX 13
|
||||
// Fujitsu
|
||||
#define CPU_A64FX 15
|
||||
// Phytium
|
||||
#define CPU_FT2000 22
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
|
|
@ -73,12 +79,17 @@ static char *cpuname[] = {
|
|||
"TSV110",
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1",
|
||||
"NEOVERSEV1"
|
||||
"NEOVERSEN2"
|
||||
"THUNDERX3T110",
|
||||
"VORTEX",
|
||||
"CORTEXA55",
|
||||
"A64FX"
|
||||
"A64FX",
|
||||
"NEOVERSEV1",
|
||||
"NEOVERSEN2",
|
||||
"CORTEXX1",
|
||||
"CORTEXX2",
|
||||
"CORTEXA510",
|
||||
"CORTEXA710",
|
||||
"FT2000"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
|
|
@ -94,12 +105,17 @@ static char *cpuname_lower[] = {
|
|||
"tsv110",
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"neoversev1",
|
||||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"vortex",
|
||||
"cortexa55",
|
||||
"a64fx"
|
||||
"a64fx",
|
||||
"neoversev1",
|
||||
"neoversen2",
|
||||
"cortexx1",
|
||||
"cortexx2",
|
||||
"cortexa510",
|
||||
"cortexa710",
|
||||
"ft2000"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
|
|
@ -182,6 +198,14 @@ int detect(void)
|
|||
return CPU_NEOVERSEN2;
|
||||
else if (strstr(cpu_part, "0xd05"))
|
||||
return CPU_CORTEXA55;
|
||||
else if (strstr(cpu_part, "0xd46"))
|
||||
return CPU_CORTEXA510;
|
||||
else if (strstr(cpu_part, "0xd47"))
|
||||
return CPU_CORTEXA710;
|
||||
else if (strstr(cpu_part, "0xd44"))
|
||||
return CPU_CORTEXX1;
|
||||
else if (strstr(cpu_part, "0xd4c"))
|
||||
return CPU_CORTEXX2;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
|
|
@ -202,6 +226,13 @@ int detect(void)
|
|||
// Fujitsu
|
||||
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
|
||||
return CPU_A64FX;
|
||||
// Apple
|
||||
else if (strstr(cpu_implementer, "0x61") && strstr(cpu_part, "0x022"))
|
||||
return CPU_VORTEX;
|
||||
// Phytium
|
||||
else if (strstr(cpu_implementer, "0x70") && (strstr(cpu_part, "0x660") || strstr(cpu_part, "0x661")
|
||||
|| strstr(cpu_part, "0x662") || strstr(cpu_part, "0x663")))
|
||||
return CPU_FT2000;
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
|
|
@ -382,7 +413,24 @@ void get_cpuconfig(void)
|
|||
printf("#define DTB_DEFAULT_ENTRIES 48\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
|
||||
case CPU_CORTEXA510:
|
||||
case CPU_CORTEXA710:
|
||||
case CPU_CORTEXX1:
|
||||
case CPU_CORTEXX2:
|
||||
printf("#define ARMV9\n");
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 4\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_FALKOR:
|
||||
printf("#define FALKOR\n");
|
||||
printf("#define L1_CODE_SIZE 65536\n");
|
||||
|
|
@ -469,9 +517,9 @@ void get_cpuconfig(void)
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#ifdef __APPLE__
|
||||
case CPU_VORTEX:
|
||||
printf("#define VORTEX \n");
|
||||
#ifdef __APPLE__
|
||||
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L1_CODE_SIZE %lld \n",value64);
|
||||
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
|
||||
|
|
@ -480,10 +528,10 @@ void get_cpuconfig(void)
|
|||
printf("#define L1_DATA_SIZE %lld \n",value64);
|
||||
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
|
||||
printf("#define L2_SIZE %lld \n",value64);
|
||||
#endif
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
#endif
|
||||
case CPU_A64FX:
|
||||
printf("#define A64FX\n");
|
||||
printf("#define L1_CODE_SIZE 65535\n");
|
||||
|
|
@ -494,6 +542,16 @@ void get_cpuconfig(void)
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
case CPU_FT2000:
|
||||
printf("#define FT2000\n");
|
||||
printf("#define L1_CODE_SIZE 32768\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 33554432\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -33,30 +33,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include <stdint.h>
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_LOONGSON3R5 1
|
||||
/* If LASX extension instructions supported,
|
||||
* using core LOONGSON3R5
|
||||
* If only LSX extension instructions supported,
|
||||
* using core LOONGSON2K1000
|
||||
* If neither LASX nor LSX extension instructions supported,
|
||||
* using core LOONGSONGENERIC (As far as I know, there is no such
|
||||
* CPU yet)
|
||||
*/
|
||||
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_LOONGSON3R5 1
|
||||
#define CPU_LOONGSON2K1000 2
|
||||
|
||||
#define LOONGARCH_CFG2 0x02
|
||||
#define LOONGARCH_LASX 1<<7
|
||||
#define LOONGARCH_LSX 1<<6
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
"LOONGSON3R5"
|
||||
"LOONGSONGENERIC",
|
||||
"LOONGSON3R5",
|
||||
"LOONGSON2K1000"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"loongsongeneric",
|
||||
"loongson3r5",
|
||||
"loongson2k1000"
|
||||
};
|
||||
|
||||
int detect(void) {
|
||||
uint32_t reg = 0;
|
||||
#ifdef __linux
|
||||
uint32_t reg = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (reg & LOONGARCH_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else
|
||||
return CPU_UNKNOWN;
|
||||
if (reg & LOONGARCH_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else if (reg & LOONGARCH_LSX)
|
||||
return CPU_LOONGSON2K1000;
|
||||
else
|
||||
return CPU_GENERIC;
|
||||
#endif
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
char *get_corename(void) {
|
||||
|
|
@ -68,11 +91,8 @@ void get_architecture(void) {
|
|||
}
|
||||
|
||||
void get_subarchitecture(void) {
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("LOONGSON3R5");
|
||||
} else {
|
||||
printf("UNKNOWN");
|
||||
}
|
||||
int d = detect();
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
void get_subdirname(void) {
|
||||
|
|
@ -80,31 +100,44 @@ void get_subdirname(void) {
|
|||
}
|
||||
|
||||
void get_cpuconfig(void) {
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("#define LOONGSON3R5\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
} else {
|
||||
printf("#define LOONGSON3R5\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
int d = detect();
|
||||
switch (d) {
|
||||
case CPU_LOONGSON3R5:
|
||||
printf("#define LOONGSON3R5\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
case CPU_LOONGSON2K1000:
|
||||
printf("#define LOONGSON2K1000\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
|
||||
default:
|
||||
printf("#define LOONGSONGENERIC\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L2_SIZE 262144\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("loongson3r5\n");
|
||||
} else {
|
||||
printf("loongarch64\n");
|
||||
}
|
||||
int d = detect();
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
/*****************************************************************************
|
||||
Copyright (c) 2011-2014, The OpenBLAS Project
|
||||
Copyright (c) 2011-2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
|
@ -13,9 +13,9 @@ met:
|
|||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
|
|
@ -70,16 +70,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_C910V 1
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_C910V 1
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"RISCV64_GENERIC",
|
||||
"C910V"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
return CPU_UNKNOWN;
|
||||
#ifdef __linux
|
||||
FILE *infile;
|
||||
char buffer[512],isa_buffer[512],model_buffer[512];
|
||||
const char* check_c910_str = "T-HEAD C910";
|
||||
char *pmodel = NULL, *pisa = NULL;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if(!strncmp(buffer, "model name", 10)){
|
||||
strcpy(model_buffer, buffer);
|
||||
pmodel = strchr(isa_buffer, ':') + 1;
|
||||
}
|
||||
|
||||
if(!strncmp(buffer, "isa", 3)){
|
||||
strcpy(isa_buffer, buffer);
|
||||
pisa = strchr(isa_buffer, '4') + 1;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (!pmodel)
|
||||
return(CPU_GENERIC);
|
||||
|
||||
if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
|
||||
return CPU_C910V;
|
||||
|
||||
return CPU_GENERIC;
|
||||
#endif
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
char *get_corename(void){
|
||||
|
|
@ -91,6 +121,7 @@ void get_architecture(void){
|
|||
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
printf("%s",cpuname[detect()]);
|
||||
}
|
||||
|
||||
void get_subdirname(void){
|
||||
|
|
@ -98,7 +129,7 @@ void get_subdirname(void){
|
|||
}
|
||||
|
||||
void get_cpuconfig(void){
|
||||
printf("#define UNKNOWN\n");
|
||||
printf("#define %s\n", cpuname[detect()]);
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
|
|
|
|||
54
cpuid_x86.c
54
cpuid_x86.c
|
|
@ -1707,8 +1707,18 @@ int get_cpuname(void){
|
|||
if (model == 0xf && stepping < 0xe)
|
||||
return CPUTYPE_NANO;
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 0x7:
|
||||
switch (exmodel) {
|
||||
case 5:
|
||||
if (support_avx2())
|
||||
return CPUTYPE_ZEN;
|
||||
else
|
||||
return CPUTYPE_DUNNINGTON;
|
||||
default:
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
if (family >= 0x8)
|
||||
return CPUTYPE_NEHALEM;
|
||||
else
|
||||
return CPUTYPE_VIAC3;
|
||||
|
|
@ -1716,7 +1726,20 @@ int get_cpuname(void){
|
|||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN){
|
||||
return CPUTYPE_NEHALEM;
|
||||
switch (family) {
|
||||
case 0x7:
|
||||
switch (exmodel) {
|
||||
case 5:
|
||||
if (support_avx2())
|
||||
return CPUTYPE_ZEN;
|
||||
else
|
||||
return CPUTYPE_DUNNINGTON;
|
||||
default:
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
default:
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_RISE){
|
||||
|
|
@ -2416,8 +2439,18 @@ int get_coretype(void){
|
|||
if (model == 0xf && stepping < 0xe)
|
||||
return CORE_NANO;
|
||||
return CORE_NEHALEM;
|
||||
case 0x7:
|
||||
switch (exmodel) {
|
||||
case 5:
|
||||
if (support_avx2())
|
||||
return CORE_ZEN;
|
||||
else
|
||||
return CORE_DUNNINGTON;
|
||||
default:
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
if (family >= 0x8)
|
||||
return CORE_NEHALEM;
|
||||
else
|
||||
return CORE_VIAC3;
|
||||
|
|
@ -2425,7 +2458,20 @@ int get_coretype(void){
|
|||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return CORE_NEHALEM;
|
||||
switch (family) {
|
||||
case 0x7:
|
||||
switch (exmodel) {
|
||||
case 5:
|
||||
if (support_avx2())
|
||||
return CORE_ZEN;
|
||||
else
|
||||
return CORE_DUNNINGTON;
|
||||
default:
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
default:
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
}
|
||||
|
||||
return CORE_UNKNOWN;
|
||||
|
|
|
|||
4
ctest.c
4
ctest.c
|
|
@ -44,6 +44,10 @@ COMPILER_DEC
|
|||
COMPILER_GNU
|
||||
#endif
|
||||
|
||||
#if defined(__fcc_version__) || defined(__FCC_version__)
|
||||
COMPILER_FUJITSU
|
||||
#endif
|
||||
|
||||
#if defined(__ANDROID__)
|
||||
OS_ANDROID
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
include_directories(${PROJECT_BINARY_DIR})
|
||||
|
||||
if (NOT NOFORTRAN)
|
||||
enable_language(Fortran)
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
|
||||
|
|
@ -28,14 +30,24 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
continue()
|
||||
endif()
|
||||
#level1
|
||||
if (NOT NOFORTRAN)
|
||||
add_executable(x${float_char}cblat1
|
||||
c_${float_char}blat1.f
|
||||
c_${float_char}blas1.c)
|
||||
else()
|
||||
add_executable(x${float_char}cblat1
|
||||
c_${float_char}blat1c.c
|
||||
c_${float_char}blas1.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
|
||||
target_link_libraries(x${float_char}cblat1 m)
|
||||
endif()
|
||||
add_test(NAME "x${float_char}cblat1"
|
||||
COMMAND $<TARGET_FILE:x${float_char}cblat1>)
|
||||
|
||||
#level2
|
||||
if (NOT NOFORTRAN)
|
||||
add_executable(x${float_char}cblat2
|
||||
c_${float_char}blat2.f
|
||||
c_${float_char}blas2.c
|
||||
|
|
@ -43,11 +55,24 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
else()
|
||||
add_executable(x${float_char}cblat2
|
||||
c_${float_char}blat2c.c
|
||||
c_${float_char}blas2.c
|
||||
c_${float_char}2chke.c
|
||||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
|
||||
target_link_libraries(x${float_char}cblat2 m)
|
||||
endif()
|
||||
add_test(NAME "x${float_char}cblat2"
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
|
||||
#level3
|
||||
if (NOT NOFORTRAN)
|
||||
add_executable(x${float_char}cblat3
|
||||
c_${float_char}blat3.f
|
||||
c_${float_char}blas3.c
|
||||
|
|
@ -55,7 +80,19 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
else()
|
||||
add_executable(x${float_char}cblat3
|
||||
c_${float_char}blat3c.c
|
||||
c_${float_char}blas3.c
|
||||
c_${float_char}3chke.c
|
||||
auxiliary.c
|
||||
c_xerbla.c
|
||||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
|
||||
target_link_libraries(x${float_char}cblat3 m)
|
||||
endif()
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
|
||||
|
|
|
|||
|
|
@ -43,11 +43,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
|
|||
ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
|
||||
|
||||
|
||||
ifeq ($(NOFORTRAN),1)
|
||||
all ::
|
||||
else
|
||||
all :: all1 all2 all3
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
all1targets += xscblat1
|
||||
|
|
@ -222,53 +218,83 @@ endif
|
|||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
# Single real
|
||||
ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN)))
|
||||
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
|
||||
xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
|
||||
xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
else
|
||||
xscblat1: $(stestl1o) c_sblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xscblat1 c_sblat1c.o $(stestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xscblat2: $(stestl2o) c_sblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xscblat2 c_sblat2c.o $(stestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xscblat3: $(stestl3o) c_sblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xscblat3 c_sblat3c.o $(stestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_DOUBLE),1)
|
||||
# Double real
|
||||
ifeq ($(NOFORTRAN),0)
|
||||
xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
else
|
||||
xdcblat1: $(dtestl1o) c_dblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xdcblat1 c_dblat1c.o $(dtestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xdcblat2: $(dtestl2o) c_dblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xdcblat2 c_dblat2c.o $(dtestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xdcblat3: $(dtestl3o) c_dblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xdcblat3 c_dblat3c.o $(dtestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
# Single complex
|
||||
ifeq ($(NOFORTRAN),0)
|
||||
xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
|
||||
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
else
|
||||
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_COMPLEX16),1)
|
||||
# Double complex
|
||||
ifeq ($(NOFORTRAN),0)
|
||||
xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
|
||||
|
||||
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
|
||||
else
|
||||
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
|
||||
endif
|
||||
endif
|
||||
|
||||
include $(TOPDIR)/Makefile.tail
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -27,11 +27,15 @@ else
|
|||
ifeq ($(ARCH),mips64)
|
||||
COMMONOBJS += dynamic_mips64.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),loongarch64)
|
||||
COMMONOBJS += dynamic_loongarch64.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
|
|
@ -99,11 +103,15 @@ else
|
|||
ifeq ($(ARCH),mips64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),loongarch64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
|
|
|||
|
|
@ -352,6 +352,20 @@ int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set)
|
|||
|
||||
return pthread_setaffinity_np(thread, cpusetsize, cpu_set);
|
||||
}
|
||||
int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
|
||||
const int active_threads = openblas_get_num_threads();
|
||||
|
||||
if (thread_idx < 0 || thread_idx >= active_threads) {
|
||||
errno = EINVAL;
|
||||
return -1;
|
||||
}
|
||||
|
||||
pthread_t thread = (thread_idx == active_threads - 1)
|
||||
? pthread_self()
|
||||
: blas_threads[thread_idx];
|
||||
|
||||
return pthread_getaffinity_np(thread, cpusetsize, cpu_set);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void* blas_thread_server(void *arg){
|
||||
|
|
|
|||
|
|
@ -403,6 +403,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
break;
|
||||
}
|
||||
|
||||
if (openblas_omp_adaptive_env() != 0) {
|
||||
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
||||
|
|
@ -412,6 +413,17 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
|
||||
exec_threads(&queue[i], buf_index);
|
||||
}
|
||||
} else {
|
||||
#pragma omp parallel for schedule(OMP_SCHED)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
||||
#ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
queue[i].position = i;
|
||||
#endif
|
||||
|
||||
exec_threads(&queue[i], buf_index);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_C11
|
||||
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||
|
|
|
|||
|
|
@ -96,7 +96,7 @@ extern gotoblas_t gotoblas_BARCELONA;
|
|||
#endif
|
||||
#ifdef DYN_ATOM
|
||||
extern gotoblas_t gotoblas_ATOM;
|
||||
elif defined(DYN_NEHALEM)
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_ATOM gotoblas_PRESCOTT
|
||||
|
|
@ -855,7 +855,11 @@ static gotoblas_t *get_coretype(void){
|
|||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
} else if (exfamily == 10) {
|
||||
} else if (exfamily == 10) {
|
||||
if(support_avx512_bf16())
|
||||
return &gotoblas_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
|
|
@ -863,7 +867,7 @@ static gotoblas_t *get_coretype(void){
|
|||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -875,14 +879,37 @@ static gotoblas_t *get_coretype(void){
|
|||
if (model == 0xf && stepping < 0xe)
|
||||
return &gotoblas_NANO;
|
||||
return &gotoblas_NEHALEM;
|
||||
case 0x7:
|
||||
switch (exmodel) {
|
||||
case 5:
|
||||
if (support_avx2())
|
||||
return &gotoblas_ZEN;
|
||||
else
|
||||
return &gotoblas_DUNNINGTON;
|
||||
default:
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
if (family >= 0x8)
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return &gotoblas_NEHALEM;
|
||||
switch (family) {
|
||||
case 0x7:
|
||||
switch (exmodel) {
|
||||
case 5:
|
||||
if (support_avx2())
|
||||
return &gotoblas_ZEN;
|
||||
else
|
||||
return &gotoblas_DUNNINGTON;
|
||||
default:
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
default:
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
|
|
|||
|
|
@ -99,6 +99,16 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
|
|||
#else
|
||||
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_NEOVERSEV1
|
||||
extern gotoblas_t gotoblas_NEOVERSEV1;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_NEOVERSEN2
|
||||
extern gotoblas_t gotoblas_NEOVERSEN2;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEX_A55
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#else
|
||||
|
|
@ -115,6 +125,8 @@ extern gotoblas_t gotoblas_THUNDERX2T99;
|
|||
extern gotoblas_t gotoblas_TSV110;
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_NEOVERSEV1;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN2;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#endif
|
||||
|
|
@ -166,8 +178,10 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[12];
|
||||
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
|
@ -198,8 +212,10 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 8: return (&gotoblas_TSV110);
|
||||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_THUNDERX3T110);
|
||||
case 12: return (&gotoblas_CORTEXA55);
|
||||
case 11: return (&gotoblas_NEOVERSEV1);
|
||||
case 12: return (&gotoblas_NEOVERSEN2);
|
||||
case 13: return (&gotoblas_THUNDERX3T110);
|
||||
case 14: return (&gotoblas_CORTEXA55);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
|
@ -258,6 +274,10 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_CORTEXA73;
|
||||
case 0xd0c: // Neoverse N1
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
case 0xd49:
|
||||
return &gotoblas_NEOVERSEN2;
|
||||
case 0xd40:
|
||||
return &gotoblas_NEOVERSEV1;
|
||||
case 0xd05: // Cortex A55
|
||||
return &gotoblas_CORTEXA55;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,128 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_LOONGSON3R5;
|
||||
extern gotoblas_t gotoblas_LOONGSON2K1000;
|
||||
extern gotoblas_t gotoblas_LOONGSONGENERIC;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
/* Number of selectable core types.  It is also the index of the trailing
 * "unknown" entry in corename[], which gotoblas_corename() returns when the
 * active kernel table matches none of the known ones. */
#define NUM_CORETYPES 3

/* Core names, indexed in the same order used by gotoblas_corename() and
 * force_coretype(): 0 = LOONGSON3R5, 1 = LOONGSON2K1000, 2 = LOONGSONGENERIC,
 * NUM_CORETYPES = fallback name. */
static char *corename[] = {
  "loongson3r5",
  "loongson2k1000",
  "loongsongeneric",
  "unknown"
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0];
|
||||
if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1];
|
||||
if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype) {
|
||||
int i;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i=0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_LOONGSON3R5);
|
||||
case 1: return (&gotoblas_LOONGSON2K1000);
|
||||
case 2: return (&gotoblas_LOONGSONGENERIC);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define LASX_MASK 1<<7
|
||||
#define LSX_MASK 1<<6
|
||||
#define LOONGARCH_CFG2 0x02
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int ret = 0;
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(ret)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (ret & LASX_MASK)
|
||||
return &gotoblas_LOONGSON3R5;
|
||||
else if (ret & LSX_MASK)
|
||||
return &gotoblas_LOONGSON2K1000;
|
||||
else
|
||||
return &gotoblas_LOONGSONGENERIC;
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
|
|
@ -877,21 +877,21 @@ void gotoblas_affinity_init(void) {
|
|||
nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
common->num_procs = nums;
|
||||
common->num_procs = nums >0 ? nums : 2;
|
||||
#else
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
common->num_procs = nums;
|
||||
common->num_procs = nums >0 ? nums : 2;
|
||||
#elif __GLIBC_PREREQ(2, 7)
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
cpusetp = CPU_ALLOC(nums>0? nums:1024);
|
||||
if (cpusetp == NULL) {
|
||||
common->num_procs = nums;
|
||||
common->num_procs = nums>0 ? nums: 2;
|
||||
} else {
|
||||
size_t size;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
size = CPU_ALLOC_SIZE(nums>0? nums: 1024);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0)
|
||||
common->num_procs = nums;
|
||||
common->num_procs = nums >0 ? nums : 1;
|
||||
else
|
||||
common->num_procs = CPU_COUNT_S(size,cpusetp);
|
||||
}
|
||||
|
|
@ -899,12 +899,12 @@ void gotoblas_affinity_init(void) {
|
|||
#else
|
||||
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
|
||||
if (ret!=0) {
|
||||
common->num_procs = nums;
|
||||
common->num_procs = nums >0 ? nums : 2;
|
||||
} else {
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
int i;
|
||||
int n = 0;
|
||||
for (i=0;i<nums;i++)
|
||||
for (i=0;i<(nums >0 ?nums:1024) ;i++)
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
common->num_procs = n;
|
||||
}
|
||||
|
|
@ -1022,7 +1022,7 @@ void gotoblas_set_affinity2(int threads) {};
|
|||
|
||||
void gotoblas_affinity_reschedule(void) {};
|
||||
|
||||
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); }
|
||||
/* Report the configured processor count from sysconf(); fall back to 2 when
 * sysconf() yields a non-positive value.  (The local was previously declared
 * as `num` while the return expression read `nums`, which did not compile.) */
int get_num_procs(void) { int nums = sysconf(_SC_NPROCESSORS_CONF); return (nums >0 ? nums : 2); }
|
||||
|
||||
int get_num_nodes(void) { return 1; }
|
||||
|
||||
|
|
|
|||
|
|
@ -252,23 +252,23 @@ int get_num_procs(void) {
|
|||
ret = omp_get_num_places();
|
||||
if (ret >0 ) nums = ret;
|
||||
#endif
|
||||
return nums;
|
||||
return (nums > 0 ? nums : 2);
|
||||
#endif
|
||||
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
return (nums > 0 ? nums : 2);
|
||||
#endif
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
if (ret!=0) return (nums > 0 ? nums :2);
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
|
|
@ -277,31 +277,31 @@ int get_num_procs(void) {
|
|||
#else
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#else
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
@ -1823,56 +1823,56 @@ int get_num_procs(void) {
|
|||
ret = omp_get_num_places();
|
||||
if (ret >0 ) nums = ret;
|
||||
#endif
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#endif
|
||||
|
||||
#if !defined(OS_LINUX)
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#endif
|
||||
|
||||
#if !defined(__GLIBC_PREREQ)
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#else
|
||||
#if !__GLIBC_PREREQ(2, 3)
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#endif
|
||||
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
|
||||
if (ret!=0) return nums;
|
||||
if (ret!=0) return (nums > 0 ? nums :2);
|
||||
n=0;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
for (i=0;i<nums;i++)
|
||||
for (i=0;i<(nums > 0 ? nums :2);i++)
|
||||
if (CPU_ISSET(i,&cpuset)) n++;
|
||||
nums=n;
|
||||
#else
|
||||
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
|
||||
#endif
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
#else
|
||||
if (nums >= CPU_SETSIZE) {
|
||||
cpusetp = CPU_ALLOC(nums);
|
||||
if (cpusetp == NULL) {
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) {
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
} else {
|
||||
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
|
||||
if (ret!=0) {
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
ret = CPU_COUNT(&cpuset);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
return nums;
|
||||
return (nums > 0 ? nums :2);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ static int openblas_env_block_factor=0;
|
|||
static int openblas_env_openblas_num_threads=0;
|
||||
static int openblas_env_goto_num_threads=0;
|
||||
static int openblas_env_omp_num_threads=0;
|
||||
static int openblas_env_omp_adaptive=0;
|
||||
|
||||
int openblas_verbose() { return openblas_env_verbose;}
|
||||
unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
|
||||
|
|
@ -46,6 +47,7 @@ int openblas_block_factor() { return openblas_env_block_factor;}
|
|||
int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
|
||||
int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
|
||||
int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
|
||||
int openblas_omp_adaptive_env() { return openblas_env_omp_adaptive;}
|
||||
|
||||
void openblas_read_env() {
|
||||
int ret=0;
|
||||
|
|
@ -79,6 +81,11 @@ void openblas_read_env() {
|
|||
if(ret<0) ret=0;
|
||||
openblas_env_omp_num_threads=ret;
|
||||
|
||||
ret=0;
|
||||
if (readenv(p,"OMP_ADAPTIVE")) ret = atoi(p);
|
||||
if(ret<0) ret=0;
|
||||
openblas_env_omp_adaptive=ret;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -60,6 +60,9 @@ static char* openblas_config_str=""
|
|||
#ifdef USE_OPENMP
|
||||
"USE_OPENMP "
|
||||
#endif
|
||||
#ifdef USE_TLS
|
||||
"USE_TLS "
|
||||
#endif
|
||||
#ifndef DYNAMIC_ARCH
|
||||
CHAR_CORENAME
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -2,6 +2,12 @@ TOPDIR = ..
|
|||
|
||||
include ../Makefile.system
|
||||
|
||||
ifdef USE_PERL
|
||||
GENSYM = gensymbol.pl
|
||||
else
|
||||
GENSYM = gensymbol
|
||||
endif
|
||||
|
||||
ifndef EXPRECISION
|
||||
EXPRECISION = 0
|
||||
endif
|
||||
|
|
@ -119,11 +125,11 @@ dll : ../$(LIBDLLNAME)
|
|||
-shared -o ../$(LIBDLLNAME) -Wl,--out-implib,../$(IMPLIBNAME) \
|
||||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
|
||||
|
||||
$(LIBPREFIX).def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
$(LIBPREFIX).def : $(GENSYM)
|
||||
./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
libgoto_hpl.def : $(GENSYM)
|
||||
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
|
|
@ -265,24 +271,24 @@ static : ../$(LIBNAME)
|
|||
$(AR) -cq ../$(LIBNAME) goto.$(SUFFIX)
|
||||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
osx.def : $(GENSYM) ../Makefile.system ../getarch.c
|
||||
./$(GENSYM) osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
aix.def : $(GENSYM) ../Makefile.system ../getarch.c
|
||||
./$(GENSYM) aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c
|
||||
./$(GENSYM) objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
objconv.def : $(GENSYM) ../Makefile.system ../getarch.c
|
||||
./$(GENSYM) objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
|
||||
linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
|
||||
./$(GENSYM) linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +1,16 @@
|
|||
#!/usr/bin/env perl
|
||||
#!/bin/sh
|
||||
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
# split STRING SEP
# Print the fields of STRING, separated on the characters in SEP, each
# followed by a single space.  Pathname expansion is switched off while the
# string is word-split so that characters such as '*' in a field are printed
# literally; IFS is put back to its previous value afterwards.
split() {
    old_ifs=$IFS
    set -f
    IFS=$2
    # Unquoted on purpose: field splitting on the temporary IFS does the work.
    set -- $1
    IFS=$old_ifs
    printf '%s ' "$@"
    set +f
}
|
||||
|
||||
hostos="$(uname -s | sed 's/\-.*//')"
|
||||
|
||||
#
|
||||
# 1. Not specified
|
||||
|
|
@ -12,407 +22,397 @@ $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
|||
# 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition
|
||||
#
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
$config = shift(@ARGV);
|
||||
makefile="$1"
|
||||
config="$2"
|
||||
|
||||
$nofortran = 0;
|
||||
nofortran=0
|
||||
|
||||
shift 2
|
||||
compiler="$*"
|
||||
compiler_bin="$1"
|
||||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
[ "$compiler" = "f77" ] && compiler=''
|
||||
|
||||
@path = split(/:/, $ENV{"PATH"});
|
||||
path=`split "$PATH" ':'`
|
||||
|
||||
if ($compiler eq "") {
|
||||
if [ -z "$compiler" ]; then
|
||||
|
||||
@lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95",
|
||||
"sunf77", "sunf90", "sunf95",
|
||||
"xlf95", "xlf90", "xlf",
|
||||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||
"pathf90", "pathf95",
|
||||
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
|
||||
"flang", "egfortran",
|
||||
"ifort", "nagfor");
|
||||
lists="gfortran g95 frt fort openf90 openf95
|
||||
sunf77 sunf90 sunf95
|
||||
xlf95 xlf90 xlf
|
||||
ppuf77 ppuf95 ppuf90 ppuxlf
|
||||
pathf90 pathf95
|
||||
pgf95 pgf90 pgf77 pgfortran nvfortran
|
||||
flang egfortran
|
||||
ifort nagfor ifx ftn crayftn"
|
||||
|
||||
OUTER:
|
||||
foreach $lists (@lists) {
|
||||
foreach $path (@path) {
|
||||
if (-x $path . "/" . $lists) {
|
||||
$compiler = $lists;
|
||||
$compiler_bin = $lists;
|
||||
last OUTER;
|
||||
for list in $lists; do
|
||||
for p in $path; do
|
||||
if [ -x "$p/$list" ]; then
|
||||
compiler=$list
|
||||
compiler_bin=$list
|
||||
break 2
|
||||
fi
|
||||
done
|
||||
done
|
||||
fi
|
||||
|
||||
if [ -z "$compiler" ]; then
|
||||
|
||||
nofortran=1
|
||||
compiler=gfortran
|
||||
vendor=GFORTRAN
|
||||
bu="_"
|
||||
|
||||
else
|
||||
{
|
||||
data="$(command -v "$compiler_bin" >/dev/null 2>&1)"
|
||||
vendor=""
|
||||
} && {
|
||||
data=`$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`
|
||||
if [ -z "$data" ]; then
|
||||
data=`$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`
|
||||
fi
|
||||
|
||||
case "$data" in *zhoge_*) bu=_ ;; esac
|
||||
|
||||
case "$data" in
|
||||
*Fujitsu*)
|
||||
vendor=FUJITSU
|
||||
openmp='-Kopenmp'
|
||||
;;
|
||||
*Cray*)
|
||||
vendor=CRAY
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*GNU*|*GCC*)
|
||||
|
||||
v="${data#*GCC: *\) }"
|
||||
v="${v%%\"*}"
|
||||
|
||||
major="${v%%.*}"
|
||||
|
||||
if [ "$major" -ge 4 ]; then
|
||||
vendor=GFORTRAN
|
||||
openmp='-fopenmp'
|
||||
else
|
||||
case "$compiler" in
|
||||
*flang*)
|
||||
vendor=FLANG
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*ifx*)
|
||||
vendor=INTEL
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*pgf*|*nvf*)
|
||||
vendor=PGI
|
||||
openmp='-mp'
|
||||
;;
|
||||
*)
|
||||
vendor=G77
|
||||
openmp=''
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
;;
|
||||
*g95*)
|
||||
vendor=G95
|
||||
openmp=''
|
||||
;;
|
||||
*Intel*)
|
||||
vendor=INTEL
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*'Sun Fortran'*)
|
||||
vendor=SUN
|
||||
openmp='-xopenmp=parallel'
|
||||
;;
|
||||
*PathScale*)
|
||||
vendor=PATHSCALE
|
||||
openmp='-openmp'
|
||||
;;
|
||||
*Open64*)
|
||||
vendor=OPEN64
|
||||
openmp='-mp'
|
||||
;;
|
||||
*PGF*|*NVF*)
|
||||
vendor=PGI
|
||||
openmp='-mp'
|
||||
;;
|
||||
*'IBM XL'*)
|
||||
vendor=IBM
|
||||
openmp='-openmp'
|
||||
;;
|
||||
*NAG*)
|
||||
vendor=NAG
|
||||
openmp='-openmp'
|
||||
;;
|
||||
esac
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
data=`$compiler -O2 -S ftest3.f >/dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`
|
||||
|
||||
[ -z "$data" ] && {
|
||||
data=`$compiler -O2 -S ftest3.f >/dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`
|
||||
}
|
||||
|
||||
case "$data" in *' zho_ge__'*) need2bu=1 ;; esac
|
||||
case "$vendor" in *G95*) [ "$NO_LAPACKE" != 1 ] && need2bu='' ;; esac
|
||||
}
|
||||
|
||||
if [ -z "$vendor" ]; then
|
||||
case "$compiler" in
|
||||
*g77*)
|
||||
vendor=G77
|
||||
bu=_
|
||||
openmp=''
|
||||
;;
|
||||
*g95*)
|
||||
vendor=G95
|
||||
bu=_
|
||||
openmp=''
|
||||
;;
|
||||
*gfortran*)
|
||||
vendor=GFORTRAN
|
||||
bu=_
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*ifort*|*ifx*)
|
||||
vendor=INTEL
|
||||
bu=_
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*pathf*)
|
||||
vendor=PATHSCALE
|
||||
bu=_
|
||||
openmp='-mp'
|
||||
;;
|
||||
*pgf*|*nvf*)
|
||||
vendor=PGI
|
||||
bu=_
|
||||
openmp='-mp'
|
||||
;;
|
||||
*ftn*)
|
||||
vendor=PGI
|
||||
bu=_
|
||||
openmp=-openmp
|
||||
;;
|
||||
*frt*)
|
||||
vendor=FUJITSU
|
||||
bu=_
|
||||
openmp='-openmp'
|
||||
;;
|
||||
*sunf77*|*sunf90*|*sunf95*)
|
||||
vendor=SUN
|
||||
bu=_
|
||||
openmp='-xopenmp=parallel'
|
||||
;;
|
||||
*ppuf*|*xlf*)
|
||||
vendor=IBM
|
||||
openmp='-openmp'
|
||||
;;
|
||||
*open64*)
|
||||
vendor=OPEN64
|
||||
openmp='-mp'
|
||||
;;
|
||||
*flang*)
|
||||
vendor=FLANG
|
||||
bu=_
|
||||
openmp='-fopenmp'
|
||||
;;
|
||||
*nagfor*)
|
||||
vendor=NAG
|
||||
bu=_
|
||||
openmp='-openmp'
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -z "$vendor" ]; then
|
||||
nofortran=1
|
||||
compiler="gfortran"
|
||||
vendor=GFORTRAN
|
||||
bu=_
|
||||
openmp=''
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
{
|
||||
data=`command -v $compiler_bin >/dev/null 2>&1`
|
||||
} && {
|
||||
|
||||
binary=$BINARY
|
||||
|
||||
[ "$USE_OPENMP" != 1 ] && openmp=''
|
||||
|
||||
case "$binary" in
|
||||
32)
|
||||
{
|
||||
link=`$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
link=`$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
# for AIX
|
||||
link=`$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
# for gfortran MIPS
|
||||
mips_data=`$compiler_bin -E -dM - < /dev/null`
|
||||
case "$mips_data" in
|
||||
*_MIPS_ISA_MIPS64*)
|
||||
link=`$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
;;
|
||||
*)
|
||||
link=`$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
;;
|
||||
esac
|
||||
} || {
|
||||
binary=''
|
||||
}
|
||||
}
|
||||
}
|
||||
;;
|
||||
64)
|
||||
{
|
||||
link=`$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
link=`$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
# for AIX
|
||||
link=`$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
# for gfortran MIPS
|
||||
link=`$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
# for nagfor
|
||||
link=`$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
} || {
|
||||
binary=''
|
||||
}
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -z "$binary" ]; then
|
||||
link=`$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
fi
|
||||
}
|
||||
|
||||
if ($compiler eq "") {
|
||||
if [ "$vendor" = "NAG" ]; then
|
||||
link=`$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
fi
|
||||
if [ "$vendor" = "CRAY" ]; then
|
||||
link=`$compiler $openmp -hnopattern ftest2.f 2>&1 && rm -f a.out a.exe`
|
||||
fi
|
||||
linker_L=""
|
||||
linker_l=""
|
||||
linker_a=""
|
||||
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
$vendor = GFORTRAN;
|
||||
$bu = "_";
|
||||
if [ -n "$link" ]; then
|
||||
|
||||
} else {
|
||||
link=`echo "$link" | sed 's/\-Y[[:space:]]P\,/\-Y/g'`
|
||||
|
||||
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
|
||||
$vendor = "";
|
||||
link=`echo "$link" | sed 's/\-R[[:space:]]*/\-rpath\%/g'`
|
||||
|
||||
if (!$?) {
|
||||
# Join "-rpath <dir>" into the single token "-rpath%<dir>".  sed uses basic
# regular expressions here, where a bare + is an ordinary character, so
# "one or more blanks" is spelled out as [[:space:]][[:space:]]*.
link=`echo "$link" | sed 's/\-rpath[[:space:]][[:space:]]*/\-rpath\%/g'`
|
||||
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
|
||||
}
|
||||
if ($data =~ /zhoge_/) {
|
||||
$bu = "_";
|
||||
}
|
||||
# Join "-rpath-link <dir>" into the single token "-rpath-link%<dir>".  sed
# uses basic regular expressions here, where a bare + is an ordinary
# character, so "one or more blanks" is spelled out explicitly.
link=`echo "$link" | sed 's/\-rpath-link[[:space:]][[:space:]]*/\-rpath-link\%/g'`
|
||||
|
||||
if ($data =~ /Fujitsu/) {
|
||||
|
||||
$vendor = FUJITSU;
|
||||
$openmp = "-Kopenmp";
|
||||
|
||||
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
|
||||
|
||||
$data =~ s/\(+.*?\)+//g;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
|
||||
if ($major >= 4) {
|
||||
$vendor = GFORTRAN;
|
||||
$openmp = "-fopenmp";
|
||||
} else {
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
} else {
|
||||
$vendor = G77;
|
||||
$openmp = "";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ($data =~ /g95/) {
|
||||
$vendor = G95;
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
if ($data =~ /Intel/) {
|
||||
$vendor = INTEL;
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($data =~ /Sun Fortran/) {
|
||||
$vendor = SUN;
|
||||
$openmp = "-xopenmp=parallel";
|
||||
}
|
||||
|
||||
if ($data =~ /PathScale/) {
|
||||
$vendor = PATHSCALE;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($data =~ /Open64/) {
|
||||
$vendor = OPEN64;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /PGF/ || $data =~ /NVF/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /IBM XL/) {
|
||||
$vendor = IBM;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($data =~ /NAG/) {
|
||||
$vendor = NAG;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
|
||||
}
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
|
||||
if ($compiler =~ /g77/) {
|
||||
$vendor = G77;
|
||||
$bu = "_";
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
if ($compiler =~ /g95/) {
|
||||
$vendor = G95;
|
||||
$bu = "_";
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
if ($compiler =~ /gfortran/) {
|
||||
$vendor = GFORTRAN;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /ifort/) {
|
||||
$vendor = INTEL;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pathf/) {
|
||||
$vendor = PATHSCALE;
|
||||
$bu = "_";
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$bu = "_";
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /ftn/) {
|
||||
$vendor = PGI;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /frt/) {
|
||||
$vendor = FUJITSU;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /sunf77|sunf90|sunf95/) {
|
||||
$vendor = SUN;
|
||||
$bu = "_";
|
||||
$openmp = "-xopenmp=parallel";
|
||||
}
|
||||
|
||||
if ($compiler =~ /ppuf/) {
|
||||
$vendor = IBM;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /xlf/) {
|
||||
$vendor = IBM;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /open64/) {
|
||||
$vendor = OPEN64;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /nagfor/) {
|
||||
$vendor = NAG;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
$vendor = GFORTRAN;
|
||||
$bu = "_";
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
|
||||
|
||||
if (!$?) {
|
||||
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
if ($binary == 32) {
|
||||
$link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
# for AIX
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For gfortran MIPS
|
||||
if ($?) {
|
||||
$mips_data = `$compiler_bin -E -dM - < /dev/null`;
|
||||
if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
|
||||
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
} else {
|
||||
$link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
|
||||
if ($binary == 64) {
|
||||
$link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
# for AIX
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For gfortran MIPS
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For nagfor
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
if ($binary eq "") {
|
||||
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
|
||||
if ( $vendor eq "NAG") {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$linker_L = "";
|
||||
$linker_l = "";
|
||||
$linker_a = "";
|
||||
|
||||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath-link\s+/\-rpath-link\%/g;
|
||||
|
||||
@flags = split(/[\s\,\n]/, $link);
|
||||
flags=`echo "$link" | tr "',\n" " "`
|
||||
# remove leading and trailing quotes from each flag.
|
||||
@flags = map {s/^['"]|['"]$//g; $_} @flags;
|
||||
#@flags = map {s/^['"]|['"]$//g; $_} @flags;
|
||||
|
||||
foreach $flags (@flags) {
|
||||
if (
|
||||
($flags =~ /^\-L/)
|
||||
&& ($flags !~ /^-LIST:/)
|
||||
&& ($flags !~ /^-LANG:/)
|
||||
) {
|
||||
$linker_L .= $flags . " ";
|
||||
}
|
||||
for flag in $flags; do
|
||||
case "$flag" in -L*)
|
||||
case "$flag" in
|
||||
-LIST:*|-LANG:*) ;;
|
||||
*) linker_L="$linker_L $flag" ;;
|
||||
esac
|
||||
esac
|
||||
|
||||
if ($flags =~ /^\-Y/) {
|
||||
next if ($hostos eq 'SunOS');
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
}
|
||||
case "$flag" in -Y*)
|
||||
[ "$hostos" = "SunOS" ] && continue
|
||||
linker_L="$linker_L -Wl,$flag"
|
||||
;;
|
||||
esac
|
||||
|
||||
if ($flags =~ /^\--exclude-libs/) {
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
$flags="";
|
||||
}
|
||||
case "$flag" in --exclude-libs*)
|
||||
linker_L="$linker_L -Wl,$flag"
|
||||
flag=""
|
||||
;;
|
||||
esac
|
||||
|
||||
case "$flag" in -rpath%*)
|
||||
flag=`echo "$flag" | sed 's/\%/\,/g'`
|
||||
linker_L="$linker_L -Wl,$flag"
|
||||
esac
|
||||
|
||||
if ($flags =~ /^\-rpath\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
case "$flag" in -rpath-link%*)
|
||||
flag=`echo "$flag" | sed 's/\%/\,/g'`
|
||||
linker_L="$linker_L -Wl,$flag"
|
||||
;;
|
||||
esac
|
||||
|
||||
if ($flags =~ /^\-rpath-link\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
|
||||
$flags = "-lomp";
|
||||
}
|
||||
case "$flag" in *-lgomp*)
|
||||
case "$CC" in *clang*)
|
||||
flag="-lomp"
|
||||
;;
|
||||
esac
|
||||
esac
|
||||
|
||||
if (
|
||||
($flags =~ /^\-l/)
|
||||
&& ($flags !~ /ibrary/)
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /flangmain/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
&& ($flags !~ /crt[0-9]/)
|
||||
&& ($flags !~ /gcc/)
|
||||
&& ($flags !~ /user32/)
|
||||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/))
|
||||
&& ($flags !~ /[0-9]+/ || ($vendor == FUJITSU && $flags =~ /^-lfj90/))
|
||||
&& ($flags !~ /^\-l$/)
|
||||
) {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
case "$flag" in -l*)
|
||||
case "$flag" in
|
||||
*ibrary*|*gfortranbegin*|*flangmain*|*frtbegin*|*pathfstart*|\
|
||||
*crt[0-9]*|*gcc*|*user32*|*kernel32*|*advapi32*|*shell32*|\
|
||||
-l) ;;
|
||||
*omp*)
|
||||
case "$vendor" in
|
||||
*PGI*|*FUJITSU*) ;;
|
||||
*) linker_l="$linker_l $flag" ;;
|
||||
esac
|
||||
;;
|
||||
*[0-9]*)
|
||||
if [ "$vendor" = "FUJITSU" ]; then
|
||||
case "$flag" in
|
||||
-lfj90*) linker_l="$linker_l $flag" ;;
|
||||
*) ;;
|
||||
esac
|
||||
fi
|
||||
;;
|
||||
*) linker_l="$linker_l $flag" ;;
|
||||
esac
|
||||
esac
|
||||
|
||||
if ( $flags =~ /quickfit.o/ && $vendor == NAG) {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
if ( $flags =~ /safefit.o/ && $vendor == NAG) {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
if ( $flags =~ /thsafe.o/ && $vendor == NAG) {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
case "$flag" in *quickfit.o*)
|
||||
[ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;;
|
||||
esac
|
||||
|
||||
$linker_a .= $flags . " " if $flags =~ /\.a$/;
|
||||
}
|
||||
case "$flag" in *safefit.o*)
|
||||
[ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;;
|
||||
esac
|
||||
|
||||
}
|
||||
case "$flag" in *thsafe.o*)
|
||||
[ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;;
|
||||
esac
|
||||
|
||||
if ($vendor eq "FLANG"){
|
||||
$linker_a .= "-lflang"
|
||||
}
|
||||
case "$flag" in *.a) linker_a="$linker_a $flag" ;; esac
|
||||
done
|
||||
fi
|
||||
|
||||
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
|
||||
open(CONFFILE, ">> $config" ) || die "Can't append $config";
|
||||
if [ "$vendor" = "FLANG" ]; then
|
||||
linker_a="$linker_a -lflang"
|
||||
fi
|
||||
|
||||
print MAKEFILE "F_COMPILER=$vendor\n";
|
||||
print MAKEFILE "FC=$compiler\n";
|
||||
print MAKEFILE "BU=$bu\n" if $bu ne "";
|
||||
print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1;
|
||||
printf "F_COMPILER=%s\n" "$vendor" >> "$makefile"
|
||||
printf "FC=%s\n" "$compiler" >> "$makefile"
|
||||
[ -n "$bu" ] && printf 'BU=%s\n' "$bu" >> "$makefile"
|
||||
[ "$nofortran" -eq 1 ] && printf 'NOFORTRAN=1\n' >> "$makefile"
|
||||
|
||||
print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne "";
|
||||
print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne "";
|
||||
print CONFFILE "#define NEED2UNDERSCORES\t1\n" if $need2bu ne "";
|
||||
[ -n "$bu" ] && printf '#define BUNDERSCORE\t%s\n' "$bu" >> "$config"
|
||||
[ -n "$bu" ] && printf '#define NEEDBUNDERSCORE\t1\n' >> "$config"
|
||||
[ -n "$need2bu" ] && printf "#define NEED2UNDERSCORES\t1\n" >> "$config"
|
||||
|
||||
print MAKEFILE "NEED2UNDERSCORES=1\n" if $need2bu ne "";
|
||||
# Record NEED2UNDERSCORES as a make variable in the generated makefile.
# The C-level "#define NEED2UNDERSCORES 1" is written to "$config" separately;
# this line must emit plain "NAME=value" make syntax, not a #define.
[ -n "$need2bu" ] && printf "NEED2UNDERSCORES=1\n" >> "$makefile"
|
||||
|
||||
if (($linker_l ne "") || ($linker_a ne "")) {
|
||||
print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||
}
|
||||
if [ -n "$linker_l" ] || [ -n "$linker_a" ]; then
|
||||
printf "FEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" >> "$makefile"
|
||||
fi
|
||||
|
||||
close(MAKEFILE);
|
||||
close(CONFFILE);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,429 @@
|
|||
#!/usr/bin/env perl
|
||||
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
||||
#
|
||||
# 1. Not specified
|
||||
# 1.1 Automatically detect, then check compiler
|
||||
# 1.2 If no fortran compiler is detected, gfortran is default with NOFORTRAN definition
|
||||
# 2. Specified
|
||||
# 2.1 If path is correct, check compiler
|
||||
# 2.2 If path is not correct, but still valid compiler name, force setting
|
||||
# 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition
|
||||
#
|
||||
|
||||
$makefile = shift(@ARGV);
|
||||
$config = shift(@ARGV);
|
||||
|
||||
$nofortran = 0;
|
||||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
|
||||
@path = split(/:/, $ENV{"PATH"});
|
||||
|
||||
if ($compiler eq "") {
|
||||
|
||||
@lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95",
|
||||
"sunf77", "sunf90", "sunf95",
|
||||
"xlf95", "xlf90", "xlf",
|
||||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||
"pathf90", "pathf95",
|
||||
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
|
||||
"flang", "egfortran",
|
||||
"ifort", "nagfor", "ifx", "ftn", "crayftn");
|
||||
|
||||
OUTER:
|
||||
foreach $lists (@lists) {
|
||||
foreach $path (@path) {
|
||||
if (-x $path . "/" . $lists) {
|
||||
$compiler = $lists;
|
||||
$compiler_bin = $lists;
|
||||
last OUTER;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ($compiler eq "") {
|
||||
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
$vendor = GFORTRAN;
|
||||
$bu = "_";
|
||||
|
||||
} else {
|
||||
|
||||
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
|
||||
$vendor = "";
|
||||
|
||||
if (!$?) {
|
||||
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
|
||||
}
|
||||
if ($data =~ /zhoge_/) {
|
||||
$bu = "_";
|
||||
}
|
||||
|
||||
if ($data =~ /Fujitsu/) {
|
||||
|
||||
$vendor = FUJITSU;
|
||||
$openmp = "-Kopenmp";
|
||||
|
||||
} elsif ($data =~ /Cray/) {
|
||||
|
||||
$vendor = CRAY;
|
||||
$openmp = "-fopenmp";
|
||||
|
||||
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
|
||||
|
||||
$data =~ s/\(+.*?\)+//g;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
|
||||
if ($major >= 4) {
|
||||
$vendor = GFORTRAN;
|
||||
$openmp = "-fopenmp";
|
||||
} else {
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} elsif ($compiler =~ /ifx/) {
|
||||
$vendor = INTEL;
|
||||
$openmp = "-fopenmp";
|
||||
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
} else {
|
||||
$vendor = G77;
|
||||
$openmp = "";
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ($data =~ /g95/) {
|
||||
$vendor = G95;
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
if ($data =~ /Intel/) {
|
||||
$vendor = INTEL;
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($data =~ /Sun Fortran/) {
|
||||
$vendor = SUN;
|
||||
$openmp = "-xopenmp=parallel";
|
||||
}
|
||||
|
||||
if ($data =~ /PathScale/) {
|
||||
$vendor = PATHSCALE;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($data =~ /Open64/) {
|
||||
$vendor = OPEN64;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /PGF/ || $data =~ /NVF/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /IBM XL/) {
|
||||
$vendor = IBM;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($data =~ /NAG/) {
|
||||
$vendor = NAG;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
|
||||
}
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
|
||||
if ($compiler =~ /g77/) {
|
||||
$vendor = G77;
|
||||
$bu = "_";
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
if ($compiler =~ /g95/) {
|
||||
$vendor = G95;
|
||||
$bu = "_";
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
if ($compiler =~ /gfortran/) {
|
||||
$vendor = GFORTRAN;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /ifort/ || $compiler =~ /ifx/) {
|
||||
$vendor = INTEL;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pathf/) {
|
||||
$vendor = PATHSCALE;
|
||||
$bu = "_";
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$bu = "_";
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /ftn/) {
|
||||
$vendor = PGI;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /frt/) {
|
||||
$vendor = FUJITSU;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /sunf77|sunf90|sunf95/) {
|
||||
$vendor = SUN;
|
||||
$bu = "_";
|
||||
$openmp = "-xopenmp=parallel";
|
||||
}
|
||||
|
||||
if ($compiler =~ /ppuf/) {
|
||||
$vendor = IBM;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /xlf/) {
|
||||
$vendor = IBM;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /open64/) {
|
||||
$vendor = OPEN64;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$bu = "_";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /nagfor/) {
|
||||
$vendor = NAG;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
$vendor = GFORTRAN;
|
||||
$bu = "_";
|
||||
$openmp = "";
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
|
||||
|
||||
if (!$?) {
|
||||
|
||||
$binary = $ENV{"BINARY"};
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
if ($binary == 32) {
|
||||
$link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
# for AIX
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For gfortran MIPS
|
||||
if ($?) {
|
||||
$mips_data = `$compiler_bin -E -dM - < /dev/null`;
|
||||
if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
|
||||
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
} else {
|
||||
$link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
|
||||
if ($binary == 64) {
|
||||
$link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
# for AIX
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For gfortran MIPS
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For nagfor
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
if ($binary eq "") {
|
||||
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
|
||||
if ( $vendor eq "NAG") {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
if ( $vendor eq "CRAY") {
|
||||
$link = `$compiler $openmp -hnopattern ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$linker_L = "";
|
||||
$linker_l = "";
|
||||
$linker_a = "";
|
||||
|
||||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath-link\s+/\-rpath-link\%/g;
|
||||
|
||||
@flags = split(/[\s\,\n]/, $link);
|
||||
# remove leading and trailing quotes from each flag.
|
||||
@flags = map {s/^['"]|['"]$//g; $_} @flags;
|
||||
|
||||
foreach $flags (@flags) {
|
||||
if (
|
||||
($flags =~ /^\-L/)
|
||||
&& ($flags !~ /^-LIST:/)
|
||||
&& ($flags !~ /^-LANG:/)
|
||||
) {
|
||||
$linker_L .= $flags . " ";
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-Y/) {
|
||||
next if ($hostos eq 'SunOS');
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
}
|
||||
|
||||
if ($flags =~ /^\--exclude-libs/) {
|
||||
$linker_L .= "-Wl,". $flags . " ";
|
||||
$flags="";
|
||||
}
|
||||
|
||||
|
||||
if ($flags =~ /^\-rpath\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-rpath-link\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
|
||||
$flags = "-lomp";
|
||||
}
|
||||
|
||||
# Keep a "-l..." flag from the compiler's link line unless it matches one of
# the exclusion patterns below; survivors are appended to $linker_l.
if (
    ($flags =~ /^\-l/)
    && ($flags !~ /ibrary/)
    && ($flags !~ /gfortranbegin/)
    && ($flags !~ /flangmain/)
    && ($flags !~ /frtbegin/)
    && ($flags !~ /pathfstart/)
    && ($flags !~ /crt[0-9]/)
    && ($flags !~ /gcc/)
    && ($flags !~ /user32/)
    && ($flags !~ /kernel32/)
    && ($flags !~ /advapi32/)
    && ($flags !~ /shell32/)
    # Flags containing "omp" are dropped when the vendor is PGI or FUJITSU.
    && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/))
    # Flags containing digits are dropped, except -lfj90* for FUJITSU.
    # The vendor is a string, so it must be compared with "eq": a numeric
    # "==" between two non-numeric strings is true for every vendor.
    && ($flags !~ /[0-9]+/ || ($vendor eq "FUJITSU" && $flags =~ /^-lfj90/))
    && ($flags !~ /^\-l$/)
    ) {
    $linker_l .= $flags . " ";
}
|
||||
|
||||
# Pass quickfit.o / safefit.o / thsafe.o through to the link line, but only
# when the detected vendor is NAG. The vendor is compared as a string
# ("eq"); a numeric "==" on these non-numeric strings matches every vendor.
if ( $flags =~ /quickfit.o/ && $vendor eq "NAG") {
    $linker_l .= $flags . " ";
}
if ( $flags =~ /safefit.o/ && $vendor eq "NAG") {
    $linker_l .= $flags . " ";
}
if ( $flags =~ /thsafe.o/ && $vendor eq "NAG") {
    $linker_l .= $flags . " ";
}
|
||||
|
||||
$linker_a .= $flags . " " if $flags =~ /\.a$/;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if ($vendor eq "FLANG"){
|
||||
$linker_a .= "-lflang"
|
||||
}
|
||||
|
||||
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
|
||||
open(CONFFILE, ">> $config" ) || die "Can't append $config";
|
||||
|
||||
print MAKEFILE "F_COMPILER=$vendor\n";
|
||||
print MAKEFILE "FC=$compiler\n";
|
||||
print MAKEFILE "BU=$bu\n" if $bu ne "";
|
||||
print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1;
|
||||
|
||||
print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne "";
|
||||
print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne "";
|
||||
print CONFFILE "#define NEED2UNDERSCORES\t1\n" if $need2bu ne "";
|
||||
|
||||
print MAKEFILE "NEED2UNDERSCORES=1\n" if $need2bu ne "";
|
||||
|
||||
if (($linker_l ne "") || ($linker_a ne "")) {
|
||||
print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||
}
|
||||
|
||||
close(MAKEFILE);
|
||||
close(CONFFILE);
|
||||
162
getarch.c
162
getarch.c
|
|
@ -94,14 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <sys/sysinfo.h>
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64)
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
|
||||
#else
|
||||
#ifndef NO_AVX512
|
||||
#define NO_AVX512
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
/* #define FORCE_P2 */
|
||||
/* #define FORCE_KATMAI */
|
||||
/* #define FORCE_COPPERMINE */
|
||||
|
|
@ -140,9 +132,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* #define FORCE_PPC440FP2 */
|
||||
/* #define FORCE_CELL */
|
||||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3R3 */
|
||||
/* #define FORCE_LOONGSON3R4 */
|
||||
/* #define FORCE_LOONGSON3R5 */
|
||||
/* #define FORCE_LOONGSON3R3 */
|
||||
/* #define FORCE_LOONGSON3R4 */
|
||||
/* #define FORCE_LOONGSON3R5 */
|
||||
/* #define FORCE_LOONGSON2K1000 */
|
||||
/* #define FORCE_LOONGSONGENERIC */
|
||||
/* #define FORCE_I6400 */
|
||||
/* #define FORCE_P6600 */
|
||||
/* #define FORCE_P5600 */
|
||||
|
|
@ -977,6 +971,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_LOONGSON2K1000
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "LOONGARCH"
|
||||
#define SUBARCHITECTURE "LOONGSON2K1000"
|
||||
#define SUBDIRNAME "loongarch64"
|
||||
#define ARCHCONFIG "-DLOONGSON2K1000 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
|
||||
#define LIBNAME "loongson2k1000"
|
||||
#define CORENAME "LOONGSON2K1000"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_LOONGSONGENERIC
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "LOONGARCH"
|
||||
#define SUBARCHITECTURE "LOONGSONGENERIC"
|
||||
#define SUBDIRNAME "loongarch64"
|
||||
#define ARCHCONFIG "-DLOONGSONGENERIC " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
|
||||
#define LIBNAME "loongsongeneric"
|
||||
#define CORENAME "LOONGSONGENERIC"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_I6400
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
|
|
@ -1240,7 +1262,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa53"
|
||||
#define CORENAME "CORTEXA53"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA57
|
||||
|
|
@ -1256,7 +1277,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa57"
|
||||
#define CORENAME "CORTEXA57"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA72
|
||||
|
|
@ -1272,7 +1292,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa72"
|
||||
#define CORENAME "CORTEXA72"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA73
|
||||
|
|
@ -1288,7 +1307,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa73"
|
||||
#define CORENAME "CORTEXA73"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXX1
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXX1"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXX1 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexx1"
|
||||
#define CORENAME "CORTEXX1"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXX2
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXX2"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXX2 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
|
||||
#define LIBNAME "cortexx2"
|
||||
#define CORENAME "CORTEXX2"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA510
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA510"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA510 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
|
||||
#define LIBNAME "cortexa510"
|
||||
#define CORENAME "CORTEXA510"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA710
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA710"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA710 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
|
||||
#define LIBNAME "cortexa710"
|
||||
#define CORENAME "CORTEXA710"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_NEOVERSEN1
|
||||
|
|
@ -1305,7 +1379,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-march=armv8.2-a -mtune=neoverse-n1"
|
||||
#define LIBNAME "neoversen1"
|
||||
#define CORENAME "NEOVERSEN1"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_NEOVERSEV1
|
||||
|
|
@ -1322,7 +1395,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-march=armv8.4-a -mtune=neoverse-v1"
|
||||
#define LIBNAME "neoversev1"
|
||||
#define CORENAME "NEOVERSEV1"
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
||||
|
|
@ -1340,7 +1412,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-march=armv8.5-a -mtune=neoverse-n2"
|
||||
#define LIBNAME "neoversen2"
|
||||
#define CORENAME "NEOVERSEN2"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA55
|
||||
|
|
@ -1356,7 +1427,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa55"
|
||||
#define CORENAME "CORTEXA55"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_FALKOR
|
||||
|
|
@ -1372,7 +1442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "falkor"
|
||||
#define CORENAME "FALKOR"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_THUNDERX
|
||||
|
|
@ -1387,7 +1456,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx"
|
||||
#define CORENAME "THUNDERX"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_THUNDERX2T99
|
||||
|
|
@ -1405,7 +1473,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx2t99"
|
||||
#define CORENAME "THUNDERX2T99"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_TSV110
|
||||
|
|
@ -1421,7 +1488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "tsv110"
|
||||
#define CORENAME "TSV110"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_EMAG8180
|
||||
|
|
@ -1456,7 +1522,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "thunderx3t110"
|
||||
#define CORENAME "THUNDERX3T110"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_VORTEX
|
||||
|
|
@ -1488,7 +1553,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
|
||||
#define LIBNAME "a64fx"
|
||||
#define CORENAME "A64FX"
|
||||
#else
|
||||
#endif
|
||||
|
||||
/* Forced target FT2000 (ARM64).
 * ARCHCONFIG is a blank-separated list of -D options assembled from adjacent
 * string literals, so every option must be followed by a space; otherwise two
 * options fuse into a single malformed definition.
 * NOTE(review): 33554426 is not a power of two (32 MiB would be 33554432) --
 * confirm the intended L2 size. */
#ifdef FORCE_FT2000
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "FT2000"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DFT2000 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=33554426 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "ft2000"
#define CORENAME "FT2000"
#endif
|
||||
|
||||
#ifdef FORCE_ZARCH_GENERIC
|
||||
|
|
@ -1524,6 +1604,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef FORCE_C910V
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "RISCV64"
|
||||
#ifdef NO_RV64GV
|
||||
#define SUBARCHITECTURE "RISCV64_GENERIC"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-DRISCV64_GENERIC " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "riscv64_generic"
|
||||
#define CORENAME "RISCV64_GENERIC"
|
||||
#else
|
||||
#define SUBARCHITECTURE "C910V"
|
||||
#define SUBDIRNAME "riscv64"
|
||||
#define ARCHCONFIG "-DC910V " \
|
||||
|
|
@ -1532,6 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "c910v"
|
||||
#define CORENAME "C910V"
|
||||
#endif
|
||||
#else
|
||||
#endif
|
||||
|
||||
|
|
@ -1632,17 +1723,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
static int get_num_cores(void) {
|
||||
|
||||
int count;
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
int m[2];
|
||||
size_t len;
|
||||
#endif
|
||||
|
||||
#if defined(linux) || defined(__sun__)
|
||||
//returns the number of processors configured (_SC_NPROCESSORS_CONF), not necessarily the number currently online
|
||||
return sysconf(_SC_NPROCESSORS_CONF);
|
||||
|
||||
count = sysconf(_SC_NPROCESSORS_CONF);
|
||||
if (count <= 0) count = 2;
|
||||
return count;
|
||||
|
||||
#elif defined(OS_WINDOWS)
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
|
@ -1653,13 +1747,15 @@ static int get_num_cores(void) {
|
|||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
sysctl(m, 2, &count, &len, NULL, 0);
|
||||
|
||||
if (count <= 0) count = 2;
|
||||
|
||||
return count;
|
||||
|
||||
#elif defined(AIX)
|
||||
//returns the number of processors which are currently online
|
||||
return sysconf(_SC_NPROCESSORS_ONLN);
|
||||
|
||||
count = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
if (count <= 0) count = 2;
|
||||
|
||||
#else
|
||||
return 2;
|
||||
#endif
|
||||
|
|
@ -1681,7 +1777,7 @@ int main(int argc, char *argv[]){
|
|||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
@ -1829,7 +1925,7 @@ printf("ELF_VERSION=2\n");
|
|||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -531,8 +531,11 @@ $(BLASOBJS) $(BLASOBJS_P) : functable.h
|
|||
$(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F)
|
||||
|
||||
functable.h : Makefile
|
||||
ifndef USE_PERL
|
||||
./create $(FUNCALLFILES) > functable.h
|
||||
|
||||
else
|
||||
./create.pl $(FUNCALLFILES) > functable.h
|
||||
endif
|
||||
endif
|
||||
|
||||
clean ::
|
||||
|
|
|
|||
|
|
@ -1,22 +1,22 @@
|
|||
#!/usr/bin/env perl
|
||||
#!/bin/sh
|
||||
|
||||
$count = 0;
|
||||
count=0
|
||||
|
||||
foreach (@ARGV) {
|
||||
print "#define\tinterface_", $_, "\t\t", $count, "\n";
|
||||
$count ++;
|
||||
}
|
||||
for arg in "$@"; do
|
||||
printf "#define\tinterface_%s\t\t%d\n" "$arg" "$count"
|
||||
count=`expr $count + 1`
|
||||
done
|
||||
|
||||
print "#ifdef USE_FUNCTABLE\n";
|
||||
printf "#ifdef USE_FUNCTABLE\n"
|
||||
|
||||
print "#define MAX_PROF_TABLE ", $count, "\n";
|
||||
printf "#define MAX_PROF_TABLE %d\n" "$count"
|
||||
|
||||
print "static char *func_table[] = {\n";
|
||||
printf "static char *func_table[] = {\n"
|
||||
|
||||
foreach (@ARGV) {
|
||||
print "\"", $_, "\",\n";
|
||||
}
|
||||
for arg in "$@"; do
|
||||
printf "\"%s\",\n" "$arg"
|
||||
done
|
||||
|
||||
print "};\n";
|
||||
print "#endif\n";
|
||||
printf "};\n"
|
||||
printf "#endif\n"
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env perl

# Writes the body of functable.h to stdout.
# Each command-line argument is a function name: it gets an
# "interface_<name>" index macro (numbered from 0 in argument order) and,
# under USE_FUNCTABLE, an entry in the func_table string array.

my @names = @ARGV;
my $count = 0;

for my $name (@names) {
    print "#define\tinterface_${name}\t\t${count}\n";
    $count++;
}

print "#ifdef USE_FUNCTABLE\n";

print "#define MAX_PROF_TABLE ${count}\n";

print "static char *func_table[] = {\n";

print qq("$_",\n) for @names;

print "};\n";
print "#endif\n";
|
||||
|
||||
|
|
@ -678,7 +678,7 @@ endif ()
|
|||
set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
|
||||
endif ()
|
||||
if (NOT DEFINED SBGEMM_SMALL_K_B0_TT)
|
||||
set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
|
||||
|
|
@ -854,49 +854,49 @@ endif ()
|
|||
# Makefile.LA
|
||||
if(NOT NO_LAPACK)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}NEG_TCOPY)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
|
||||
set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c)
|
||||
set(${float_char}NEG_TCOPY ../generic/zneg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
|
||||
else ()
|
||||
set(${float_char}NEG_TCOPY ../generic/neg_tcopy.c)
|
||||
set(${float_char}NEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED ${float_char}LASWP_NCOPY)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
|
||||
set(${float_char}LASWP_NCOPY ../generic/zlaswp_ncopy.c)
|
||||
set(${float_char}LASWP_NCOPY ../generic/zlaswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
|
||||
else ()
|
||||
set(${float_char}LASWP_NCOPY ../generic/laswp_ncopy.c)
|
||||
set(${float_char}LASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
|
||||
endif ()
|
||||
endif ()
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}_${${float_char}GEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}_${${float_char}GEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}" "" "neg_tcopy" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}" "" "laswp_ncopy" false "" "" false ${float_type})
|
||||
endforeach()
|
||||
if (BUILD_COMPLEX AND NOT BUILD_SINGLE)
|
||||
if (NOT DEFINED SNEG_TCOPY)
|
||||
set(SNEG_TCOPY ../generic/neg_tcopy.c)
|
||||
set(SNEG_TCOPY ../generic/neg_tcopy_${SGEMM_UNROLL_M}.c)  # SINGLE helper: use the S unroll, not float_char left over from the foreach above
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED SLASWP_NCOPY)
|
||||
set(SLASWP_NCOPY ../generic/laswp_ncopy.c)
|
||||
set(SLASWP_NCOPY ../generic/laswp_ncopy_${SGEMM_UNROLL_N}.c)  # same reasoning: float_char is stale outside the loop
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}_${SGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}_${SGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}" "" "neg_tcopy" false "" "" false "SINGLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "SINGLE")
|
||||
endif()
|
||||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
|
||||
if (NOT DEFINED DNEG_TCOPY)
|
||||
set(DNEG_TCOPY ../generic/neg_tcopy.c)
|
||||
set(DNEG_TCOPY ../generic/neg_tcopy_${DGEMM_UNROLL_M}.c)  # DOUBLE helper: use the D unroll, not float_char left over from the foreach above
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED DLASWP_NCOPY)
|
||||
set(DLASWP_NCOPY ../generic/laswp_ncopy.c)
|
||||
set(DLASWP_NCOPY ../generic/laswp_ncopy_${DGEMM_UNROLL_N}.c)  # same reasoning: float_char is stale outside the loop
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "DOUBLE")
|
||||
GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}" "" "neg_tcopy" false "" "" false "DOUBLE")  # variable already carries the _<unroll>.c suffix; do not append it again
|
||||
GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "DOUBLE")  # matches the SINGLE block's call form
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,216 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
TRSMCOPYLN_M = trsm_lncopy_sve.c
|
||||
TRSMCOPYLT_M = trsm_ltcopy_sve.c
|
||||
TRSMCOPYUN_M = trsm_uncopy_sve.c
|
||||
TRSMCOPYUT_M = trsm_utcopy_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
|
||||
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
|
||||
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
|
||||
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
|
||||
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
|
||||
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
|
||||
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
SSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
SSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
CSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
TRSMCOPYLN_M = trsm_lncopy_sve.c
|
||||
TRSMCOPYLT_M = trsm_ltcopy_sve.c
|
||||
TRSMCOPYUN_M = trsm_uncopy_sve.c
|
||||
TRSMCOPYUT_M = trsm_utcopy_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
|
||||
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
|
||||
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
|
||||
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
|
||||
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
|
||||
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
|
||||
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
SSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
SSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
CSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
|
@ -0,0 +1 @@
|
|||
include $(KERNELDIR)/KERNEL.CORTEXA57
|
||||
|
|
@ -0,0 +1,216 @@
|
|||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
TRSMCOPYLN_M = trsm_lncopy_sve.c
|
||||
TRSMCOPYLT_M = trsm_ltcopy_sve.c
|
||||
TRSMCOPYUN_M = trsm_uncopy_sve.c
|
||||
TRSMCOPYUT_M = trsm_utcopy_sve.c
|
||||
|
||||
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
|
||||
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
|
||||
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
|
||||
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
|
||||
|
||||
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
|
||||
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
|
||||
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
|
||||
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
|
||||
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
|
||||
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
|
||||
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
SSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
SSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
|
||||
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
|
||||
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
|
||||
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
|
||||
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
|
||||
|
||||
DSYMMUCOPY_M = symm_ucopy_sve.c
|
||||
DSYMMLCOPY_M = symm_lcopy_sve.c
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
CSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
|
||||
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
|
||||
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
|
||||
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
|
||||
|
||||
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
|
||||
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
|
||||
|
||||
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
|
||||
ZSYMMLCOPY_M = zsymm_lcopy_sve.c
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
include $(KERNELDIR)/KERNEL.CORTEXA57
|
||||
|
||||
|
||||
|
|
@ -187,3 +187,14 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
|||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
SBGEMM_BETA = sbgemm_beta_neoversen2.c
|
||||
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c
|
||||
SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c
|
||||
SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c
|
||||
SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c
|
||||
SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c
|
||||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,173 @@
|
|||
ifndef DSDOTKERNEL
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
endif
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
|
||||
CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
|
||||
|
||||
#Pure C for other kernels
|
||||
SAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = ../arm/amax.c
|
||||
CAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = ../arm/zamax.c
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = ../arm/iamax.c
|
||||
ICAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMAXKERNEL = ../arm/izamax.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
SASUMKERNEL = ../arm/asum.c
|
||||
DASUMKERNEL = ../arm/asum.c
|
||||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
||||
SCOPYKERNEL = ../arm/copy.c
|
||||
DCOPYKERNEL = ../arm/copy.c
|
||||
CCOPYKERNEL = ../arm/zcopy.c
|
||||
ZCOPYKERNEL = ../arm/zcopy.c
|
||||
|
||||
SDOTKERNEL = ../arm/dot.c
|
||||
DDOTKERNEL = ../arm/dot.c
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
||||
SSCALKERNEL = ../arm/scal.c
|
||||
DSCALKERNEL = ../arm/scal.c
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
|
||||
SSWAPKERNEL = ../arm/swap.c
|
||||
DSWAPKERNEL = ../arm/swap.c
|
||||
CSWAPKERNEL = ../arm/zswap.c
|
||||
ZSWAPKERNEL = ../arm/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
|
@ -404,6 +404,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
#else
|
||||
nrm2_compute(n, x, inc_x, &ssq, &scale);
|
||||
#endif
|
||||
if (fabs(scale) <1.e-300) return 0.;
|
||||
ssq = sqrt(ssq) * scale;
|
||||
|
||||
return ssq;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,83 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Scale a block of C in place by beta.
 *
 * The block is visited as n runs of m contiguous elements; consecutive runs
 * start ldc elements apart.
 *
 *   m, n    extent of the block (elements per run, number of runs)
 *   beta    scaling factor
 *   c, ldc  first element of the block and distance between runs
 *   dummy*  unused; present only so the signature matches what callers pass
 *
 * When beta compares equal to ZERO the elements are overwritten with ZERO
 * instead of being multiplied, so previous contents (including NaN/Inf) are
 * discarded rather than propagated.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2,
          BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c,
          BLASLONG ldc) {
    BLASLONG i, j;
    BLASLONG chunk = m >> 3;  /* groups of 8 elements handled by the unrolled loop */
    BLASLONG remain = m & 7;  /* leftover elements after the unrolled groups */
    FLOAT *c_offset = c;      /* start of the run currently being processed */
    FLOAT *c_offset1;         /* cursor inside the current run */

    if (beta == ZERO) {
        for (j = n; j > 0; j--) {
            c_offset1 = c_offset;
            c_offset += ldc;

            for (i = chunk; i > 0; i--) {
                c_offset1[0] = ZERO;
                c_offset1[1] = ZERO;
                c_offset1[2] = ZERO;
                c_offset1[3] = ZERO;
                c_offset1[4] = ZERO;
                c_offset1[5] = ZERO;
                c_offset1[6] = ZERO;
                c_offset1[7] = ZERO;
                c_offset1 += 8;
            }

            for (i = remain; i > 0; i--) {
                *c_offset1 = ZERO;
                c_offset1++;
            }
        }
    } else {
        for (j = n; j > 0; j--) {
            c_offset1 = c_offset;
            c_offset += ldc;

            for (i = chunk; i > 0; i--) {
                c_offset1[0] *= beta;
                c_offset1[1] *= beta;
                c_offset1[2] *= beta;
                c_offset1[3] *= beta;
                c_offset1[4] *= beta;
                c_offset1[5] *= beta;
                c_offset1[6] *= beta;
                c_offset1[7] *= beta;
                c_offset1 += 8;
            }

            for (i = remain; i > 0; i--) {
                *c_offset1 *= beta;
                c_offset1++;
            }
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define ALPHA_ONE
|
||||
#include "sbgemm_kernel_8x4_neoversen2_impl.c"
|
||||
#undef ALPHA_ONE
|
||||
#include "sbgemm_kernel_8x4_neoversen2_impl.c"
|
||||
|
||||
/*
 * SBGEMM kernel entry point: forwards to one of the two kernel variants
 * generated from sbgemm_kernel_8x4_neoversen2_impl.c.
 *
 * alpha == 1.0f selects the variant built with ALPHA_ONE defined; any other
 * alpha selects the general variant. All arguments are passed through
 * unchanged and the selected kernel's return value is returned.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B,
          FLOAT *C, BLASLONG ldc) {
    if (alpha == 1.0f)
        return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc);

    return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc);
}
|
||||
|
|
@ -0,0 +1,665 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Helper macros for the kernel body below.
 *
 * This file is written to be included with and without ALPHA_ONE defined
 * (the function name below switches on it). The LOAD_C / STORE_C families
 * expand differently in the two cases, so drop any definition left over from
 * a previous inclusion first; redefining a macro with a different body
 * without #undef is diagnosed by the compiler.
 */
#undef LOAD_C
#undef LOAD_C_LOW
#undef LOAD_C_EVEN
#undef LOAD_C_FIRST
#undef STORE_C
#undef STORE_C_LOW
#undef STORE_C_EVEN
#undef STORE_C_FIRST

#ifdef ALPHA_ONE
/*
 * ALPHA_ONE variant: the accumulator mcMN is seeded directly from C and
 * written back unchanged after the multiply-accumulate steps.
 * The _LOW / _EVEN / _FIRST forms differ only in the predicate used.
 */
#define LOAD_C(M, N) \
    mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc);

#define LOAD_C_LOW(M, N) \
    mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_EVEN(M, N) \
    mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_FIRST(M, N) \
    mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc);

#define STORE_C(M, N) \
    svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_LOW(M, N) \
    svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_EVEN(M, N) \
    svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_FIRST(M, N) \
    svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#else
/*
 * General-alpha variant: the accumulator mcMN starts at zero, the original
 * C values are kept in ocMN, and the store writes svalpha * mcMN + ocMN.
 */
#define LOAD_C(M, N) \
    mc##M##N = svdup_f32(0); \
    oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M , off_vc);

#define LOAD_C_LOW(M, N) \
    mc##M##N = svdup_f32(0); \
    oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M , off_vc);

#define LOAD_C_EVEN(M, N) \
    mc##M##N = svdup_f32(0); \
    oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M , off_vc);

#define LOAD_C_FIRST(M, N) \
    mc##M##N = svdup_f32(0); \
    oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M , off_vc);

#define STORE_C(M, N) \
    mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \
    svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_LOW(M, N) \
    mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \
    svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_EVEN(M, N) \
    mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \
    svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_FIRST(M, N) \
    mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \
    svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#endif

/* Full-vector BF16 loads from the current A / B cursors. */
#define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M);

#define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N);

/* One BF16 matrix multiply-accumulate step into accumulator mcMN. */
#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N);

/*
 * Loaders for the k remainder (k & 3 == 1, 2 or 3). Each builds an
 * 8-element BF16 group from the remaining packed values and pads every
 * 4-element half with `zero`. The plain forms fill both halves (2, 4 or 6
 * values read); the _LOW forms fill only the first half (1, 2 or 3 values
 * read) and zero the second half entirely.
 */
#define LOAD_KREST_1(NAME, M) \
    m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \
                             *(ptr_##NAME##M + 1), zero, zero, zero);

#define LOAD_KREST_1_LOW(NAME, M) \
    m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \
                             zero, zero);

#define LOAD_KREST_2(NAME, M) \
    m##NAME##M = \
        svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \
                    *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero);

#define LOAD_KREST_2_LOW(NAME, M) \
    m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \
                             zero, zero, zero, zero, zero);

#define LOAD_KREST_3(NAME, M) \
    m##NAME##M = \
        svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
                    *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \
                    *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero);

#define LOAD_KREST_3_LOW(NAME, M) \
    m##NAME##M = \
        svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
                    *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero);
|
||||
|
||||
|
||||
/*
 * SBGEMM micro-kernel body, compiled once per inclusion of this file.
 *
 * With ALPHA_ONE defined the function is named
 * sbgemm_kernel_neoversen2_alpha_one and accumulates the BF16 products
 * directly onto the values loaded from C. Otherwise it is named
 * sbgemm_kernel_neoversen2_alpha, accumulates the products from zero and
 * stores svalpha * products + original C (see the LOAD_C / STORE_C macros).
 *
 *   m, n, k  problem extents; k is consumed 4 at a time, with the k & 3
 *            remainder handled by the LOAD_KREST_* loaders
 *   alpha    scaling factor (only read in the non-ALPHA_ONE variant)
 *   A, B     packed BF16 operands; the pointer arithmetic below steps A by
 *            2 * k per accumulator row group and B by 2 * k per column pair
 *            (presumably the layout produced by the matching copy routines
 *            -- confirm against them)
 *   C, ldc   FP32 output and the distance between its columns
 *
 * n is processed in groups of 4, then 2, then 1 columns; within each group
 * m is processed in groups of 8, then 4, 2 and 1 rows. Every accumulator
 * mcMN covers a 2x2 patch of C addressed through off_vc.
 *
 * Always returns 0.
 */
#ifdef ALPHA_ONE
int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
#else
int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
#endif
{
    bfloat16_t *ptr_a = (bfloat16_t *)A;
    bfloat16_t *ptr_b = (bfloat16_t *)B;
    FLOAT *ptr_c = C;

    bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3;
    bfloat16_t *ptr_b0, *ptr_b1;
    FLOAT *ptr_c00, *ptr_c01;

    svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1;
    svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31;
#ifndef ALPHA_ONE
    /* Original C values and the broadcast alpha, used only by STORE_C*. */
    svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31;
    svfloat32_t svalpha = svdup_f32(alpha);
#endif
    svbool_t pg16 = svptrue_b16();
    /* First four 16-bit lanes of each 8-lane group. */
    svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
    svbool_t pg32 = svptrue_b32();
    /* Lanes 0-1: offsets 0 and ldc (one row, two columns). */
    svbool_t pg32_low = svdupq_b32(1, 1, 0, 0);
    /* Lanes 0 and 2: offsets 0 and 1 (two rows, one column). */
    svbool_t pg32_even = svdupq_b32(1, 0, 1, 0);
    /* Lane 0 only: a single element. */
    svbool_t pg32_first = svdupq_b32(1, 0, 0, 0);
    bfloat16 tmp = 0;
    bfloat16_t zero = *((bfloat16_t *)&tmp);
    BLASLONG krest = k & 3;

    // 00 01 10 11
    /* NOTE(review): ldc is narrowed to 32 bits for the gather/scatter index. */
    svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1);

    /* ---- groups of 4 columns ---- */
    for (BLASLONG j = 0; j < n / 4; j++) {
        ptr_c00 = ptr_c;
        ptr_c01 = ptr_c + 2 * ldc;
        ptr_c += 4 * ldc;

        ptr_a = (bfloat16_t *)A;

        /* 8 rows x 4 columns */
        for (BLASLONG i = 0; i < m / 8; i++) {
            ptr_a0 = ptr_a;
            ptr_a1 = ptr_a0 + 2 * k;
            ptr_a2 = ptr_a1 + 2 * k;
            ptr_a3 = ptr_a2 + 2 * k;
            ptr_a += 8 * k;

            ptr_b0 = ptr_b;
            ptr_b1 = ptr_b0 + 2 * k;

            LOAD_C(0, 0); LOAD_C(0, 1);
            LOAD_C(1, 0); LOAD_C(1, 1);
            LOAD_C(2, 0); LOAD_C(2, 1);
            LOAD_C(3, 0); LOAD_C(3, 1);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
                LOAD_B(0); LOAD_B(1);

                MATMUL(0, 0); MATMUL(0, 1);
                MATMUL(1, 0); MATMUL(1, 1);
                MATMUL(2, 0); MATMUL(2, 1);
                MATMUL(3, 0); MATMUL(3, 1);

                ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
                ptr_b0 += 8; ptr_b1 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
                    LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
                    LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
                    LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
                    LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
                    LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
                    LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
                }
                MATMUL(0, 0); MATMUL(0, 1);
                MATMUL(1, 0); MATMUL(1, 1);
                MATMUL(2, 0); MATMUL(2, 1);
                MATMUL(3, 0); MATMUL(3, 1);
            }

            STORE_C(0, 0); STORE_C(0, 1);
            STORE_C(1, 0); STORE_C(1, 1);
            STORE_C(2, 0); STORE_C(2, 1);
            STORE_C(3, 0); STORE_C(3, 1);

            ptr_c00 += 8; ptr_c01 += 8;
        }

        /* 4 rows x 4 columns */
        if (m & 4) {
            ptr_a0 = ptr_a;
            ptr_a1 = ptr_a0 + 2 * k;
            ptr_a += 4 * k;

            ptr_b0 = ptr_b;
            ptr_b1 = ptr_b0 + 2 * k;

            LOAD_C(0, 0); LOAD_C(0, 1);
            LOAD_C(1, 0); LOAD_C(1, 1);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0); LOAD_A(1);
                LOAD_B(0); LOAD_B(1);

                MATMUL(0, 0); MATMUL(0, 1);
                MATMUL(1, 0); MATMUL(1, 1);

                ptr_a0 += 8; ptr_a1 += 8;
                ptr_b0 += 8; ptr_b1 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
                    LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
                    LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
                    LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
                }
                MATMUL(0, 0); MATMUL(0, 1);
                MATMUL(1, 0); MATMUL(1, 1);
            }

            STORE_C(0, 0); STORE_C(0, 1);
            STORE_C(1, 0); STORE_C(1, 1);

            ptr_c00 += 4; ptr_c01 += 4;
        }

        /* 2 rows x 4 columns */
        if (m & 2) {
            ptr_a0 = ptr_a;
            ptr_a += 2 * k;

            ptr_b0 = ptr_b;
            ptr_b1 = ptr_b0 + 2 * k;

            LOAD_C(0, 0); LOAD_C(0, 1);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0);
                LOAD_B(0); LOAD_B(1);

                MATMUL(0, 0); MATMUL(0, 1);

                ptr_a0 += 8;
                ptr_b0 += 8; ptr_b1 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0);
                    LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0);
                    LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0);
                    LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
                }
                MATMUL(0, 0); MATMUL(0, 1);
            }

            STORE_C(0, 0); STORE_C(0, 1);
            ptr_c00 += 2; ptr_c01 += 2;
        }

        /* 1 row x 4 columns: only the pg32_low lanes of C are touched */
        if (m & 1) {
            ptr_a0 = ptr_a;

            ptr_b0 = ptr_b;
            ptr_b1 = ptr_b0 + 2 * k;

            LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1);

            for (BLASLONG p = 0; p < k / 4; p++) {
                ma0 = svld1_bf16(pg16_low, ptr_a0);
                LOAD_B(0); LOAD_B(1);

                MATMUL(0, 0); MATMUL(0, 1);

                ptr_a0 += 4;
                ptr_b0 += 8;
                ptr_b1 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1_LOW(a, 0);
                    LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
                } else if (krest == 2) {
                    LOAD_KREST_2_LOW(a, 0);
                    LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
                } else if (krest == 3) {
                    LOAD_KREST_3_LOW(a, 0);
                    LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
                }
                MATMUL(0, 0); MATMUL(0, 1);
            }

            STORE_C_LOW(0, 0); STORE_C_LOW(0, 1);
        }

        ptr_b += 4 * k;
    }

    /* ---- remaining pair of columns ---- */
    if (n & 2) {
        ptr_c00 = ptr_c;
        ptr_c += 2 * ldc;

        ptr_a = (bfloat16_t *)A;

        /* 8 rows x 2 columns */
        for (BLASLONG i = 0; i < m / 8; i++) {
            ptr_a0 = ptr_a;
            ptr_a1 = ptr_a0 + 2 * k;
            ptr_a2 = ptr_a1 + 2 * k;
            ptr_a3 = ptr_a2 + 2 * k;
            ptr_a += 8 * k;

            ptr_b0 = ptr_b;

            LOAD_C(0, 0);
            LOAD_C(1, 0);
            LOAD_C(2, 0);
            LOAD_C(3, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
                LOAD_B(0);

                MATMUL(0, 0);
                MATMUL(1, 0);
                MATMUL(2, 0);
                MATMUL(3, 0);

                ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
                ptr_b0 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
                    LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
                    LOAD_KREST_1(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
                    LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
                    LOAD_KREST_2(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
                    LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
                    LOAD_KREST_3(b, 0);
                }
                MATMUL(0, 0);
                MATMUL(1, 0);
                MATMUL(2, 0);
                MATMUL(3, 0);
            }

            STORE_C(0, 0);
            STORE_C(1, 0);
            STORE_C(2, 0);
            STORE_C(3, 0);

            ptr_c00 += 8;
        }

        /* 4 rows x 2 columns */
        if (m & 4) {
            ptr_a0 = ptr_a;
            ptr_a1 = ptr_a0 + 2 * k;
            ptr_a += 4 * k;

            ptr_b0 = ptr_b;

            LOAD_C(0, 0);
            LOAD_C(1, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0); LOAD_A(1);
                LOAD_B(0);

                MATMUL(0, 0);
                MATMUL(1, 0);

                ptr_a0 += 8; ptr_a1 += 8;
                ptr_b0 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
                    LOAD_KREST_1(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
                    LOAD_KREST_2(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
                    LOAD_KREST_3(b, 0);
                }
                MATMUL(0, 0);
                MATMUL(1, 0);
            }

            STORE_C(0, 0);
            STORE_C(1, 0);

            ptr_c00 += 4;
        }

        /* 2 rows x 2 columns */
        if (m & 2) {
            ptr_a0 = ptr_a;
            ptr_a += 2 * k;
            ptr_b0 = ptr_b;

            LOAD_C(0, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0);
                LOAD_B(0);
                MATMUL(0, 0);
                ptr_a0 += 8;
                ptr_b0 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0);
                    LOAD_KREST_1(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0);
                    LOAD_KREST_2(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0);
                    LOAD_KREST_3(b, 0);
                }
                MATMUL(0, 0);
            }

            STORE_C(0, 0);
            ptr_c00 += 2;
        }

        /* 1 row x 2 columns */
        if (m & 1) {
            ptr_a0 = ptr_a;

            ptr_b0 = ptr_b;

            /*
             * Only the pg32_low lanes (offsets 0 and ldc) belong to this
             * row and only they are stored below, so gather just those.
             * An all-true gather would also read ptr_c00[1] and
             * ptr_c00[ldc + 1], which are past the last row.
             */
            LOAD_C_LOW(0, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                ma0 = svld1_bf16(pg16_low, ptr_a0);
                LOAD_B(0);
                MATMUL(0, 0);
                ptr_a0 += 4;
                ptr_b0 += 8;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1_LOW(a, 0);
                    LOAD_KREST_1(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2_LOW(a, 0);
                    LOAD_KREST_2(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3_LOW(a, 0);
                    LOAD_KREST_3(b, 0);
                }
                MATMUL(0, 0);
            }

            STORE_C_LOW(0, 0);
        }

        ptr_b += 2 * k;
    }

    /* ---- last single column: B is loaded through pg16_low / *_LOW ---- */
    if (n & 1) {
        ptr_c00 = ptr_c;
        ptr_a = (bfloat16_t *) A;

        /* 8 rows x 1 column */
        for (BLASLONG i = 0; i < m / 8; i++) {
            ptr_a0 = ptr_a;
            ptr_a1 = ptr_a0 + 2 * k;
            ptr_a2 = ptr_a1 + 2 * k;
            ptr_a3 = ptr_a2 + 2 * k;
            ptr_a += 8 * k;

            ptr_b0 = ptr_b;

            LOAD_C_EVEN(0, 0);
            LOAD_C_EVEN(1, 0);
            LOAD_C_EVEN(2, 0);
            LOAD_C_EVEN(3, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
                mb0 = svld1_bf16(pg16_low, ptr_b0);

                MATMUL(0, 0);
                MATMUL(1, 0);
                MATMUL(2, 0);
                MATMUL(3, 0);

                ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
                ptr_b0 += 4;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
                    LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
                    LOAD_KREST_1_LOW(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
                    LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
                    LOAD_KREST_2_LOW(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
                    LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
                    LOAD_KREST_3_LOW(b, 0);
                }
                MATMUL(0, 0);
                MATMUL(1, 0);
                MATMUL(2, 0);
                MATMUL(3, 0);
            }

            STORE_C_EVEN(0, 0);
            STORE_C_EVEN(1, 0);
            STORE_C_EVEN(2, 0);
            STORE_C_EVEN(3, 0);

            ptr_c00 += 8;
        }

        /* 4 rows x 1 column */
        if (m & 4) {
            ptr_a0 = ptr_a;
            ptr_a1 = ptr_a0 + 2 * k;
            ptr_a += 4 * k;

            ptr_b0 = ptr_b;

            LOAD_C_EVEN(0, 0);
            LOAD_C_EVEN(1, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0); LOAD_A(1);
                mb0 = svld1_bf16(pg16_low, ptr_b0);

                MATMUL(0, 0);
                MATMUL(1, 0);

                ptr_a0 += 8; ptr_a1 += 8;
                ptr_b0 += 4;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
                    LOAD_KREST_1_LOW(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
                    LOAD_KREST_2_LOW(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
                    LOAD_KREST_3_LOW(b, 0);
                }
                MATMUL(0, 0);
                MATMUL(1, 0);
            }

            STORE_C_EVEN(0, 0);
            STORE_C_EVEN(1, 0);

            ptr_c00 += 4;
        }

        /* 2 rows x 1 column */
        if (m & 2) {
            ptr_a0 = ptr_a;
            ptr_a += 2 * k;

            ptr_b0 = ptr_b;

            LOAD_C_EVEN(0, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                LOAD_A(0);
                mb0 = svld1_bf16(pg16_low, ptr_b0);

                MATMUL(0, 0);

                ptr_a0 += 8;
                ptr_b0 += 4;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1(a, 0);
                    LOAD_KREST_1_LOW(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2(a, 0);
                    LOAD_KREST_2_LOW(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3(a, 0);
                    LOAD_KREST_3_LOW(b, 0);
                }
                MATMUL(0, 0);
            }

            STORE_C_EVEN(0, 0);
            ptr_c00 += 2;
        }

        /* 1 row x 1 column: a single element of C */
        if (m & 1) {
            ptr_a0 = ptr_a;
            ptr_b0 = ptr_b;

            LOAD_C_FIRST(0, 0);

            for (BLASLONG p = 0; p < k / 4; p++) {
                ma0 = svld1_bf16(pg16_low, ptr_a0);
                mb0 = svld1_bf16(pg16_low, ptr_b0);

                MATMUL(0, 0);

                ptr_a0 += 4;
                ptr_b0 += 4;
            }

            if (krest) {
                if (krest == 1) {
                    LOAD_KREST_1_LOW(a, 0);
                    LOAD_KREST_1_LOW(b, 0);
                } else if (krest == 2) {
                    LOAD_KREST_2_LOW(a, 0);
                    LOAD_KREST_2_LOW(b, 0);
                } else if (krest == 3) {
                    LOAD_KREST_3_LOW(a, 0);
                    LOAD_KREST_3_LOW(b, 0);
                }
                MATMUL(0, 0);
            }

            STORE_C_FIRST(0, 0);
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Pack the source matrix `a` into the contiguous buffer `b`.
 *
 * Source vectors start lda elements apart and hold m elements each. They
 * are consumed two at a time: for every group of 4 elements, the 4 values
 * of the first vector are written, followed by the 4 values of the second.
 * A trailing group of m & 3 elements is packed the same way without
 * padding. If n is odd, the last vector is copied on its own.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    const BLASLONG tail = m & 3; /* elements left after the groups of 4 */
    IFLOAT *src = a;
    IFLOAT *dst = b;
    BLASLONG pair, quad, t;

    for (pair = 0; pair < n / 2; pair++) {
        IFLOAT *first = src;
        IFLOAT *second = src + lda;
        src += 2 * lda;

        for (quad = 0; quad < m / 4; quad++) {
            for (t = 0; t < 4; t++) {
                dst[t] = first[t];
            }
            for (t = 0; t < 4; t++) {
                dst[4 + t] = second[t];
            }
            first += 4;
            second += 4;
            dst += 8;
        }

        /* Remainder: `tail` values from each vector, back to back. */
        for (t = 0; t < tail; t++) {
            dst[t] = first[t];
        }
        for (t = 0; t < tail; t++) {
            dst[tail + t] = second[t];
        }
        dst += 2 * tail;
    }

    if (n & 1) {
        /* Unpaired last vector: straight copy, 4 at a time then the rest. */
        for (quad = 0; quad < m / 4; quad++) {
            for (t = 0; t < 4; t++) {
                dst[t] = src[t];
            }
            dst += 4;
            src += 4;
        }
        for (t = 0; t < tail; t++) {
            dst[t] = src[t];
        }
    }

    return 0;
}
|
||||
|
|
@ -0,0 +1,109 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) 2022, The OpenBLAS Project
|
||||
* All rights reserved.
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written permission.
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
* *****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/*
 * Pack the operand `a` into the contiguous buffer `b`.
 *
 * `a` is read as a set of lines spaced `lda` elements apart: `m` counts the
 * lines and `n` counts the consecutive elements inside each line.
 *
 * Layout written to `b`:
 *   - For every pair of consecutive elements (n / 2 pairs) and every group
 *     of four lines (m / 4 groups), eight values are emitted: the first
 *     element of the four lines, then the second element of the four lines.
 *   - The m & 3 leftover lines of a pair are packed the same way, without
 *     padding (2 * (m & 3) values).
 *   - If n is odd, the remaining single element of every line is appended,
 *     four lines at a time, followed by the m & 3 leftover lines.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  const BLASLONG pairs = n / 2;  /* number of two-element pairs per line   */
  const BLASLONG groups = m / 4; /* number of complete four-line groups    */
  const BLASLONG tail = m & 3;   /* lines left over after the full groups  */
  IFLOAT *src = a;
  IFLOAT *dst = b;

  for (BLASLONG p = 0; p < pairs; p++) {
    /* `line` walks down the lines for the current pair of elements. */
    IFLOAT *line = src;
    src += 2;

    for (BLASLONG g = 0; g < groups; g++) {
      for (BLASLONG c = 0; c < 2; c++) {
        for (BLASLONG r = 0; r < 4; r++) {
          dst[c * 4 + r] = line[r * lda + c];
        }
      }
      dst += 8;
      line += 4 * lda;
    }

    /* Leftover lines: same element-major ordering, just `tail` lines wide. */
    for (BLASLONG c = 0; c < 2; c++) {
      for (BLASLONG r = 0; r < tail; r++) {
        dst[c * tail + r] = line[r * lda + c];
      }
    }
    dst += 2 * tail;
  }

  if (n & 1) {
    /* Odd n: one element per line remains, starting at `src`. */
    for (BLASLONG g = 0; g < groups; g++) {
      for (BLASLONG r = 0; r < 4; r++) {
        dst[r] = src[r * lda];
      }
      dst += 4;
      src += 4 * lda;
    }

    for (BLASLONG r = 0; r < tail; r++) {
      dst[r] = src[r * lda];
    }
  }

  return 0;
}
|
||||
|
|
@ -198,8 +198,8 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
|||
static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, OPENBLAS_COMPLEX_FLOAT *result)
|
||||
{
|
||||
FLOAT dotr = 0.0, doti = 0.0;
|
||||
CREAL(*result) = 0.0;
|
||||
CIMAG(*result) = 0.0;
|
||||
OPENBLAS_COMPLEX_FLOAT cf = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
|
||||
*result = cf;
|
||||
|
||||
if ( n < 0 ) return;
|
||||
|
||||
|
|
@ -290,8 +290,8 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
|
|||
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
|
||||
);
|
||||
|
||||
CREAL(*result) = dotr;
|
||||
CIMAG(*result) = doti;
|
||||
cf=OPENBLAS_MAKE_COMPLEX_FLOAT(dotr, doti);
|
||||
*result = cf;
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -312,9 +312,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
int nthreads;
|
||||
FLOAT dummy_alpha;
|
||||
#endif
|
||||
OPENBLAS_COMPLEX_FLOAT zdot;
|
||||
CREAL(zdot) = 0.0;
|
||||
CIMAG(zdot) = 0.0;
|
||||
OPENBLAS_COMPLEX_FLOAT zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
|
||||
|
||||
#if defined(SMP)
|
||||
if (inc_x == 0 || inc_y == 0 || n <= 10000)
|
||||
|
|
@ -341,8 +339,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
|
||||
CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
|
||||
zdot = OPENBLAS_MAKE_COMPLEX_FLOAT (CREAL(zdot) + CREAL(*ptr), CIMAG(zdot) + CIMAG(*ptr));
|
||||
ptr = (void *)(((char *)ptr) + sizeof(double) * 2);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,10 +108,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
|||
SGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifndef DGEMMKERNEL
|
||||
|
|
@ -120,10 +120,10 @@ DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
|||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifndef CGEMMKERNEL
|
||||
|
|
@ -132,10 +132,10 @@ CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
|||
CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifndef ZGEMMKERNEL
|
||||
|
|
@ -144,10 +144,10 @@ ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
|||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifndef SGEMM_BETA
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@ DGEMMINCOPY = dgemm_ncopy_16.S
|
|||
DGEMMITCOPY = dgemm_tcopy_16.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4.S
|
||||
DGEMMOTCOPY = dgemm_tcopy_4.S
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
|
|
|
|||
|
|
@ -11,26 +11,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
|||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define s4 $f9
|
||||
#define ALPHA $f4
|
||||
#define max $f5
|
||||
#define INF $f6
|
||||
|
||||
PROLOGUE
|
||||
|
||||
|
|
@ -61,6 +62,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
// Init INF
|
||||
addi.d TEMP, $r0, 0x7FF
|
||||
slli.d TEMP, TEMP, 52
|
||||
MTC INF, TEMP
|
||||
|
||||
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
|
|
@ -198,7 +204,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
CMPEQ $fcc0, s1, a1
|
||||
fcvt.d.s ALPHA, ALPHA
|
||||
bcnez $fcc0, .L999
|
||||
|
||||
fdiv.d ALPHA, ALPHA, s1
|
||||
CMPEQ $fcc0, INF, ALPHA
|
||||
bcnez $fcc0, .L999
|
||||
|
||||
MOV max, s1
|
||||
MOV s1, a1
|
||||
MOV s2, a1
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@
|
|||
|
||||
#define ALPHA $f16
|
||||
#define max $f17
|
||||
#define INF $f18
|
||||
|
||||
|
||||
PROLOGUE
|
||||
|
|
@ -86,6 +87,11 @@
|
|||
move XX, X
|
||||
NOP
|
||||
|
||||
//Init INF
|
||||
lui TEMP, 0x7FF0
|
||||
dsll TEMP, TEMP, 32
|
||||
MTC1 TEMP, INF
|
||||
|
||||
LD a1, 0 * SIZE(X)
|
||||
daddiu N, N, -1
|
||||
|
||||
|
|
@ -255,6 +261,9 @@
|
|||
div.d ALPHA, ALPHA, s1
|
||||
MOV max, s1
|
||||
|
||||
CMPEQ $fcc0, ALPHA, INF
|
||||
bc1t $fcc0, .L999
|
||||
|
||||
MOV s1, a1
|
||||
MOV s2, a1
|
||||
MOV s3, a1
|
||||
|
|
|
|||
|
|
@ -1,152 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL 1
|
||||
|
||||
/*
 * copy_kernel: copy data from x to y in 256-byte chunks using inline
 * assembly (lxvp register-pair loads from x, stxv 16-byte stores to y).
 *
 * n - element count. Each chunk subtracts 32 from n while moving 256 bytes,
 *     i.e. 8 bytes per counted element. The code always moves at least one
 *     chunk and only ever moves whole chunks, so n has to be a positive
 *     multiple of 32.
 *     NOTE(review): the caller is presumably responsible for any remainder
 *     and for never passing n <= 0 - confirm at the call site.
 * x - source pointer; advanced inside the asm (operand %2).
 * y - destination pointer; advanced inside the asm (operand %3).
 *
 * The loop is software pipelined: one chunk is loaded before the loop, each
 * iteration stores the chunk held in registers while loading the next one,
 * and the code after the loop stores the last chunk.
 *
 * On little-endian builds the two registers of every pair are stored in
 * swapped order (odd register at the lower address).
 * NOTE(review): presumably this matches how lxvp orders a pair's halves on
 * little endian - confirm against the Power ISA.
 */
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
/* Prologue: load the first 256 bytes of x into vs32..vs47. */
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
/* Advance x past the loaded chunk, count it (n -= 32, sets cr0) and skip
   the loop when nothing is left to load. */
"addi %2, %2, 256 \n\t"
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
/* Main loop, first half: store vs32..vs39 to y[0..127] ... */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 32, 0(%3) \n\t"
|
||||
"stxv 33, 16(%3) \n\t"
|
||||
"stxv 34, 32(%3) \n\t"
|
||||
"stxv 35, 48(%3) \n\t"
|
||||
"stxv 36, 64(%3) \n\t"
|
||||
"stxv 37, 80(%3) \n\t"
|
||||
"stxv 38, 96(%3) \n\t"
|
||||
"stxv 39, 112(%3) \n\t"
|
||||
#else
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
#endif
|
||||
/* ... then refill the same registers with the next chunk's first half. */
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
/* Second half: store vs40..vs47 to y[128..255] ... */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 40, 128(%3) \n\t"
|
||||
"stxv 41, 144(%3) \n\t"
|
||||
"stxv 42, 160(%3) \n\t"
|
||||
"stxv 43, 176(%3) \n\t"
|
||||
"stxv 44, 192(%3) \n\t"
|
||||
"stxv 45, 208(%3) \n\t"
|
||||
"stxv 46, 224(%3) \n\t"
|
||||
"stxv 47, 240(%3) \n\t"
|
||||
#else
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
#endif
|
||||
/* ... then refill them with the next chunk's second half. */
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
|
||||
/* Advance both pointers by one chunk. */
"addi %3, %3, 256 \n\t"
|
||||
"addi %2, %2, 256 \n\t"
|
||||
|
||||
/* Count the chunk just loaded; keep looping while elements remain. */
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
/* Epilogue: store the last chunk still held in vs32..vs47. */
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 32, 0(%3) \n\t"
|
||||
"stxv 33, 16(%3) \n\t"
|
||||
"stxv 34, 32(%3) \n\t"
|
||||
"stxv 35, 48(%3) \n\t"
|
||||
"stxv 36, 64(%3) \n\t"
|
||||
"stxv 37, 80(%3) \n\t"
|
||||
"stxv 38, 96(%3) \n\t"
|
||||
"stxv 39, 112(%3) \n\t"
|
||||
"stxv 40, 128(%3) \n\t"
|
||||
"stxv 41, 144(%3) \n\t"
|
||||
"stxv 42, 160(%3) \n\t"
|
||||
"stxv 43, 176(%3) \n\t"
|
||||
"stxv 44, 192(%3) \n\t"
|
||||
"stxv 45, 208(%3) \n\t"
|
||||
"stxv 46, 224(%3) \n\t"
|
||||
"stxv 47, 240(%3) \n\t"
|
||||
#else
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
#endif
|
||||
/* Trailing template line starting with '#': lists the operand mapping and
   emits no instruction (presumably an assembler comment - confirm). */
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
/* Outputs: %0 marks *y as written memory; n, x and y are read-write. */
"=m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
/* Input: %4 marks *x as read memory. */
"m" (*x)
|
||||
:
|
||||
/* Clobbers: cr0 (set by addic.) and every vector register used above. */
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
|
||||
);
|
||||
}
|
||||
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "ccopy_microk_power10.c"
|
||||
#include "copy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL
|
||||
|
|
@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -32;
|
||||
BLASLONG n1 = n & -64;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
copy_kernel(n1, x, y);
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
#if defined(POWER10)
|
||||
#pragma GCC optimize "O1"
|
||||
#include "cdot_microk_power10.c"
|
||||
#else
|
||||
#ifndef HAVE_KERNEL_8
|
||||
|
|
|
|||
|
|
@ -61,37 +61,97 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 32, 0(%3) \n\t"
|
||||
"stxv 33, 16(%3) \n\t"
|
||||
"stxv 34, 32(%3) \n\t"
|
||||
"stxv 35, 48(%3) \n\t"
|
||||
"stxv 36, 64(%3) \n\t"
|
||||
"stxv 37, 80(%3) \n\t"
|
||||
"stxv 38, 96(%3) \n\t"
|
||||
"stxv 39, 112(%3) \n\t"
|
||||
#else
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
#endif
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 40, 128(%3) \n\t"
|
||||
"stxv 41, 144(%3) \n\t"
|
||||
"stxv 42, 160(%3) \n\t"
|
||||
"stxv 43, 176(%3) \n\t"
|
||||
"stxv 44, 192(%3) \n\t"
|
||||
"stxv 45, 208(%3) \n\t"
|
||||
"stxv 46, 224(%3) \n\t"
|
||||
"stxv 47, 240(%3) \n\t"
|
||||
#else
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
#endif
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 48, 256(%3) \n\t"
|
||||
"stxv 49, 272(%3) \n\t"
|
||||
"stxv 50, 288(%3) \n\t"
|
||||
"stxv 51, 304(%3) \n\t"
|
||||
"stxv 52, 320(%3) \n\t"
|
||||
"stxv 53, 336(%3) \n\t"
|
||||
"stxv 54, 352(%3) \n\t"
|
||||
"stxv 55, 368(%3) \n\t"
|
||||
#else
|
||||
"stxv 49, 256(%3) \n\t"
|
||||
"stxv 48, 272(%3) \n\t"
|
||||
"stxv 51, 288(%3) \n\t"
|
||||
"stxv 50, 304(%3) \n\t"
|
||||
"stxv 53, 320(%3) \n\t"
|
||||
"stxv 52, 336(%3) \n\t"
|
||||
"stxv 55, 352(%3) \n\t"
|
||||
"stxv 54, 368(%3) \n\t"
|
||||
#endif
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 56, 384(%3) \n\t"
|
||||
"stxv 57, 400(%3) \n\t"
|
||||
"stxv 58, 416(%3) \n\t"
|
||||
"stxv 59, 432(%3) \n\t"
|
||||
"stxv 60, 448(%3) \n\t"
|
||||
"stxv 61, 464(%3) \n\t"
|
||||
"stxv 62, 480(%3) \n\t"
|
||||
"stxv 63, 496(%3) \n\t"
|
||||
#else
|
||||
"stxv 57, 384(%3) \n\t"
|
||||
"stxv 56, 400(%3) \n\t"
|
||||
"stxv 59, 416(%3) \n\t"
|
||||
"stxv 58, 432(%3) \n\t"
|
||||
"stxv 61, 448(%3) \n\t"
|
||||
"stxv 60, 464(%3) \n\t"
|
||||
"stxv 63, 480(%3) \n\t"
|
||||
"stxv 62, 496(%3) \n\t"
|
||||
#endif
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
|
|
@ -111,22 +171,73 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
|
||||
"two%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 32, 0(%3) \n\t"
|
||||
"stxv 33, 16(%3) \n\t"
|
||||
"stxv 34, 32(%3) \n\t"
|
||||
"stxv 35, 48(%3) \n\t"
|
||||
"stxv 36, 64(%3) \n\t"
|
||||
"stxv 37, 80(%3) \n\t"
|
||||
"stxv 38, 96(%3) \n\t"
|
||||
"stxv 39, 112(%3) \n\t"
|
||||
"stxv 40, 128(%3) \n\t"
|
||||
"stxv 41, 144(%3) \n\t"
|
||||
"stxv 42, 160(%3) \n\t"
|
||||
"stxv 43, 176(%3) \n\t"
|
||||
"stxv 44, 192(%3) \n\t"
|
||||
"stxv 45, 208(%3) \n\t"
|
||||
"stxv 46, 224(%3) \n\t"
|
||||
"stxv 47, 240(%3) \n\t"
|
||||
"stxv 48, 256(%3) \n\t"
|
||||
"stxv 49, 272(%3) \n\t"
|
||||
"stxv 50, 288(%3) \n\t"
|
||||
"stxv 51, 304(%3) \n\t"
|
||||
"stxv 52, 320(%3) \n\t"
|
||||
"stxv 53, 336(%3) \n\t"
|
||||
"stxv 54, 352(%3) \n\t"
|
||||
"stxv 55, 368(%3) \n\t"
|
||||
"stxv 56, 384(%3) \n\t"
|
||||
"stxv 57, 400(%3) \n\t"
|
||||
"stxv 58, 416(%3) \n\t"
|
||||
"stxv 59, 432(%3) \n\t"
|
||||
"stxv 60, 448(%3) \n\t"
|
||||
"stxv 61, 464(%3) \n\t"
|
||||
"stxv 62, 480(%3) \n\t"
|
||||
"stxv 63, 496(%3) \n\t"
|
||||
#else
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
"stxv 49, 256(%3) \n\t"
|
||||
"stxv 48, 272(%3) \n\t"
|
||||
"stxv 51, 288(%3) \n\t"
|
||||
"stxv 50, 304(%3) \n\t"
|
||||
"stxv 53, 320(%3) \n\t"
|
||||
"stxv 52, 336(%3) \n\t"
|
||||
"stxv 55, 352(%3) \n\t"
|
||||
"stxv 54, 368(%3) \n\t"
|
||||
"stxv 57, 384(%3) \n\t"
|
||||
"stxv 56, 400(%3) \n\t"
|
||||
"stxv 59, 416(%3) \n\t"
|
||||
"stxv 58, 432(%3) \n\t"
|
||||
"stxv 61, 448(%3) \n\t"
|
||||
"stxv 60, 464(%3) \n\t"
|
||||
"stxv 63, 480(%3) \n\t"
|
||||
"stxv 62, 496(%3) \n\t"
|
||||
#endif
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
|
|
|
|||
|
|
@ -95,18 +95,38 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
|
|||
"xvaddsp 50, 50, 36 \n\t"
|
||||
"xvaddsp 51, 51, 37 \n\t"
|
||||
|
||||
"stxvp 48, 0(%2) \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 48, 0(%2) \n\t"
|
||||
"stxv 49, 16(%2) \n\t"
|
||||
#else
|
||||
"stxv 49, 0(%2) \n\t"
|
||||
"stxv 48, 16(%2) \n\t"
|
||||
#endif
|
||||
"xvaddsp 52, 52, 38 \n\t"
|
||||
"xvaddsp 53, 53, 39 \n\t"
|
||||
|
||||
"stxvp 50, 32(%2) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 50, 32(%2) \n\t"
|
||||
"stxv 51, 48(%2) \n\t"
|
||||
#else
|
||||
"stxv 51, 32(%2) \n\t"
|
||||
"stxv 50, 48(%2) \n\t"
|
||||
#endif
|
||||
|
||||
"xvaddsp 54, 54, 56 \n\t"
|
||||
"xvaddsp 55, 55, 57 \n\t"
|
||||
|
||||
"stxvp 52, 64(%2) \n\t"
|
||||
"stxvp 54, 96(%2) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 52, 64(%2) \n\t"
|
||||
"stxv 53, 80(%2) \n\t"
|
||||
"stxv 54, 96(%2) \n\t"
|
||||
"stxv 55, 112(%2) \n\t"
|
||||
#else
|
||||
"stxv 53, 64(%2) \n\t"
|
||||
"stxv 52, 80(%2) \n\t"
|
||||
"stxv 55, 96(%2) \n\t"
|
||||
"stxv 54, 112(%2) \n\t"
|
||||
#endif
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
|
@ -148,18 +168,39 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
|
|||
"xvaddsp 50, 50, 36 \n\t"
|
||||
"xvaddsp 51, 51, 37 \n\t"
|
||||
|
||||
"stxvp 48, 0(%2) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 48, 0(%2) \n\t"
|
||||
"stxv 49, 16(%2) \n\t"
|
||||
#else
|
||||
"stxv 49, 0(%2) \n\t"
|
||||
"stxv 48, 16(%2) \n\t"
|
||||
#endif
|
||||
|
||||
"xvaddsp 52, 52, 38 \n\t"
|
||||
"xvaddsp 53, 53, 39 \n\t"
|
||||
|
||||
"stxvp 50, 32(%2) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 50, 32(%2) \n\t"
|
||||
"stxv 51, 48(%2) \n\t"
|
||||
#else
|
||||
"stxv 51, 32(%2) \n\t"
|
||||
"stxv 50, 48(%2) \n\t"
|
||||
#endif
|
||||
|
||||
"xvaddsp 54, 54, 56 \n\t"
|
||||
"xvaddsp 55, 55, 57 \n\t"
|
||||
|
||||
"stxvp 52, 64(%2) \n\t"
|
||||
"stxvp 54, 96(%2) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 52, 64(%2) \n\t"
|
||||
"stxv 53, 80(%2) \n\t"
|
||||
"stxv 54, 96(%2) \n\t"
|
||||
"stxv 55, 112(%2) \n\t"
|
||||
#else
|
||||
"stxv 53, 64(%2) \n\t"
|
||||
"stxv 52, 80(%2) \n\t"
|
||||
"stxv 55, 96(%2) \n\t"
|
||||
"stxv 54, 112(%2) \n\t"
|
||||
#endif
|
||||
|
||||
"#n=%1 x=%0=%2 alpha=(%3,%4)\n"
|
||||
:
|
||||
|
|
|
|||
|
|
@ -60,14 +60,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
|||
"xvmaddadp 37, 33, %x4 \n\t"
|
||||
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"stxvp 36, 0(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 36, 0(%3) \n\t"
|
||||
"stxv 37, 16(%3) \n\t"
|
||||
#else
|
||||
"stxv 37, 0(%3) \n\t"
|
||||
"stxv 36, 16(%3) \n\t"
|
||||
#endif
|
||||
|
||||
"xvmaddadp 38, 34, %x4 \n\t"
|
||||
"xvmaddadp 39, 35, %x4 \n\t"
|
||||
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"stxvp 38, 32(%3) \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 38, 32(%3) \n\t"
|
||||
"stxv 39, 48(%3) \n\t"
|
||||
#else
|
||||
"stxv 39, 32(%3) \n\t"
|
||||
"stxv 38, 48(%3) \n\t"
|
||||
#endif
|
||||
|
||||
"lxvp 36, 128(%3) \n\t"
|
||||
"lxvp 38, 160(%3) \n\t"
|
||||
|
|
@ -76,13 +87,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
|||
"xvmaddadp 45, 41, %x4 \n\t"
|
||||
|
||||
"lxvp 40, 64(%2) \n\t"
|
||||
"stxvp 44, 64(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 44, 64(%3) \n\t"
|
||||
"stxv 45, 80(%3) \n\t"
|
||||
#else
|
||||
"stxv 45, 64(%3) \n\t"
|
||||
"stxv 44, 80(%3) \n\t"
|
||||
#endif
|
||||
|
||||
"xvmaddadp 46, 42, %x4 \n\t"
|
||||
"xvmaddadp 47, 43, %x4 \n\t"
|
||||
|
||||
"lxvp 42, 96(%2) \n\t"
|
||||
"stxvp 46, 96(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 46, 96(%3) \n\t"
|
||||
"stxv 47, 112(%3) \n\t"
|
||||
#else
|
||||
"stxv 47, 96(%3) \n\t"
|
||||
"stxv 46, 112(%3) \n\t"
|
||||
#endif
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
|
@ -105,10 +128,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
|
|||
"xvmaddadp 46, 42, %x4 \n\t"
|
||||
"xvmaddadp 47, 43, %x4 \n\t"
|
||||
|
||||
"stxvp 36, 0(%3) \n\t"
|
||||
"stxvp 38, 32(%3) \n\t"
|
||||
"stxvp 44, 64(%3) \n\t"
|
||||
"stxvp 46, 96(%3) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 36, 0(%3) \n\t"
|
||||
"stxv 37, 16(%3) \n\t"
|
||||
"stxv 38, 32(%3) \n\t"
|
||||
"stxv 39, 48(%3) \n\t"
|
||||
"stxv 44, 64(%3) \n\t"
|
||||
"stxv 45, 80(%3) \n\t"
|
||||
"stxv 46, 96(%3) \n\t"
|
||||
"stxv 47, 112(%3) \n\t"
|
||||
#else
|
||||
"stxv 37, 0(%3) \n\t"
|
||||
"stxv 36, 16(%3) \n\t"
|
||||
"stxv 39, 32(%3) \n\t"
|
||||
"stxv 38, 48(%3) \n\t"
|
||||
"stxv 45, 64(%3) \n\t"
|
||||
"stxv 44, 80(%3) \n\t"
|
||||
"stxv 47, 96(%3) \n\t"
|
||||
"stxv 46, 112(%3) \n\t"
|
||||
#endif
|
||||
|
||||
"#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n"
|
||||
:
|
||||
|
|
|
|||
|
|
@ -68,7 +68,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] += da * x[i] ;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -87,7 +87,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
{
|
||||
if ( n >= 64 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] = x[i] ;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -35,327 +35,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
|
||||
#if defined(HAVE_KERNEL4x8_ASM)
|
||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
|
||||
|
||||
#if !__has_builtin(__builtin_vsx_disassemble_pair)
|
||||
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
|
||||
#endif
|
||||
typedef __vector unsigned char vec_t;
|
||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
BLASLONG i;
|
||||
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
|
||||
BLASLONG off2;
|
||||
BLASLONG tempR;
|
||||
__asm__(
|
||||
|
||||
"sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
|
||||
"sldi %[off], %[off], 3 \n\t" // lda * sizeof (double)
|
||||
"xxlxor 34,34,34 \n\t"
|
||||
"xxlxor 35,34,34 \n\t"
|
||||
"add %[a2], %[a0], %[temp] \n\t"
|
||||
"add %[a1], %[a0], %[off] \n\t"
|
||||
"xxlxor 4,34,34 \n\t"
|
||||
"xxlxor 5,34,34 \n\t"
|
||||
"xxlxor 6,34,34 \n\t"
|
||||
"xxlxor 7,34,34 \n\t"
|
||||
"add %[a3], %[a2], %[off] \n\t"
|
||||
"add %[a4], %[a2], %[temp] \n\t"
|
||||
|
||||
"xxlxor 8,34,34 \n\t"
|
||||
"xxlxor 9,34,34 \n\t"
|
||||
"add %[a5], %[a3], %[temp] \n\t"
|
||||
"li %[off],0 \n\t"
|
||||
"li %[off2],16 \n\t"
|
||||
|
||||
"add %[a6], %[a4], %[temp] \n\t"
|
||||
"add %[a7], %[a5], %[temp] \n\t"
|
||||
|
||||
|
||||
|
||||
|
||||
"lxvp 32, 0(%[x]) \n\t"
|
||||
"lxvp 36, 0(%[a0]) \n\t"
|
||||
"lxvp 38, 0(%[a1]) \n\t"
|
||||
"lxvp 40, 0(%[a2]) \n\t"
|
||||
"lxvp 42, 0(%[a3]) \n\t"
|
||||
"lxvp 44, 0(%[a4]) \n\t"
|
||||
"lxvp 46, 0(%[a5]) \n\t"
|
||||
"lxvp 48, 0(%[a6]) \n\t"
|
||||
"lxvp 50, 0(%[a7]) \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"li %[temp],896 \n\t"
|
||||
#endif
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
|
||||
"li %[off],32 \n\t"
|
||||
|
||||
|
||||
"ble- two%= \n\t"
|
||||
|
||||
//--------------------------------------------------
|
||||
".align 5 \n\t"
|
||||
"one%=: \n\t"
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 32(%[a0]) \n\t"
|
||||
"lxvp 38, 32(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"lxvp 40, 32(%[a2]) \n\t"
|
||||
"lxvp 42, 32(%[a3]) \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 32(%[a4]) \n\t"
|
||||
"lxvp 46, 32(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"lxvp 48, 32(%[a6]) \n\t"
|
||||
"lxvp 50, 32(%[a7]) \n\t"
|
||||
"lxvp 32, 32(%[x]) \n\t"
|
||||
"ble- two%= \n\t"
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 64(%[a0]) \n\t"
|
||||
"lxvp 38, 64(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"lxvp 40, 64(%[a2]) \n\t"
|
||||
"lxvp 42, 64(%[a3]) \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 64(%[a4]) \n\t"
|
||||
"lxvp 46, 64(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"lxvp 48, 64(%[a6]) \n\t"
|
||||
"lxvp 50, 64(%[a7]) \n\t"
|
||||
"lxvp 32, 64(%[x]) \n\t"
|
||||
"ble- two%= \n\t"
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"addi %[temp],%[temp],128 \n\t"
|
||||
#endif
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a0] \n\t"
|
||||
#endif
|
||||
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 96(%[a0]) \n\t"
|
||||
"lxvp 38, 96(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a1] \n\t"
|
||||
#endif
|
||||
"lxvp 40, 96(%[a2]) \n\t"
|
||||
"lxvp 42, 96(%[a3]) \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 96(%[a4]) \n\t"
|
||||
"lxvp 46, 96(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a3] \n\t"
|
||||
#endif
|
||||
"lxvp 48, 96(%[a6]) \n\t"
|
||||
"lxvp 50, 96(%[a7]) \n\t"
|
||||
"lxvp 32, 96(%[x]) \n\t"
|
||||
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"ble- two%= \n\t"
|
||||
|
||||
"addi %[off2], %[off2],32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a2] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a4] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a5] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"lxvp 36, 128(%[a0]) \n\t"
|
||||
"lxvp 38, 128(%[a1]) \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"addi %[off], %[off],32 \n\t"
|
||||
"lxvp 40, 128(%[a2]) \n\t"
|
||||
"lxvp 42, 128(%[a3]) \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a6] \n\t"
|
||||
#endif
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"lxvp 44, 128(%[a4]) \n\t"
|
||||
"lxvp 46, 128(%[a5]) \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[a7] \n\t"
|
||||
#endif
|
||||
"addic. %[n],%[n],-4 \n\t"
|
||||
"lxvp 48, 128(%[a6]) \n\t"
|
||||
"lxvp 50, 128(%[a7]) \n\t"
|
||||
"lxvp 32, 128(%[x]) \n\t"
|
||||
#if defined(PREFETCH)
|
||||
"dcbt %[temp],%[x] \n\t"
|
||||
#endif
|
||||
"addi %[a0], %[a0], 128 \n\t"
|
||||
"addi %[a1], %[a1], 128 \n\t"
|
||||
"addi %[a2], %[a2], 128 \n\t"
|
||||
"addi %[a3], %[a3], 128 \n\t"
|
||||
"addi %[a4], %[a4], 128 \n\t"
|
||||
"addi %[a5], %[a5], 128 \n\t"
|
||||
"addi %[a6], %[a6], 128 \n\t"
|
||||
"addi %[a7], %[a7], 128 \n\t"
|
||||
"addi %[x], %[x], 128 \n\t"
|
||||
"bgt+ one%= \n\t"
|
||||
".align 5 \n\t"
|
||||
"two%=: \n\t"
|
||||
//--------------------------------------------
|
||||
|
||||
"xvmaddadp 34,36,32 \n\t"
|
||||
"xvmaddadp 35,38,32 \n\t"
|
||||
"xvmaddadp 4,40,32 \n\t"
|
||||
"xvmaddadp 5,42,32 \n\t"
|
||||
"xvmaddadp 6,44,32 \n\t"
|
||||
"xvmaddadp 7,46,32 \n\t"
|
||||
"xvmaddadp 8,48,32 \n\t"
|
||||
"xvmaddadp 9,50,32 \n\t"
|
||||
XXSPLTD_S(36,%x[alpha],0)
|
||||
"xvmaddadp 34,37,33 \n\t"
|
||||
"xvmaddadp 35,39,33 \n\t"
|
||||
"xvmaddadp 4,41,33 \n\t"
|
||||
"xvmaddadp 5,43,33 \n\t"
|
||||
"xvmaddadp 6,45,33 \n\t"
|
||||
"xvmaddadp 7,47,33 \n\t"
|
||||
"xvmaddadp 8,49,33 \n\t"
|
||||
"xvmaddadp 9,51,33 \n\t"
|
||||
|
||||
"lxvp 38, 0(%[y]) \n\t"
|
||||
"lxvp 40, 32(%[y]) \n\t"
|
||||
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXMRGHD_S(42,34,35)
|
||||
XXMRGLD_S(43,34,35)
|
||||
|
||||
XXMRGHD_S(44,4,5)
|
||||
XXMRGLD_S(45,4,5)
|
||||
#else
|
||||
XXMRGLD_S(42,35,34)
|
||||
XXMRGHD_S(43,35,34)
|
||||
|
||||
XXMRGLD_S(44,5,4)
|
||||
XXMRGHD_S(45,5,4)
|
||||
#endif
|
||||
|
||||
"xvadddp 42,42,43 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXMRGHD_S(46,6,7)
|
||||
XXMRGLD_S(47,6,7)
|
||||
#else
|
||||
XXMRGLD_S(46,7,6)
|
||||
XXMRGHD_S(47,7,6)
|
||||
#endif
|
||||
"xvadddp 44,44,45 \n\t"
|
||||
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
XXMRGHD_S(48,8,9)
|
||||
XXMRGLD_S(49,8,9)
|
||||
#else
|
||||
XXMRGLD_S(48,9,8)
|
||||
XXMRGHD_S(49,9,8)
|
||||
#endif
|
||||
"xvadddp 46,46,47 \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 38,42,36 \n\t"
|
||||
"xvmaddadp 39,44,36 \n\t"
|
||||
#else
|
||||
"xvmaddadp 39,42,36 \n\t"
|
||||
"xvmaddadp 38,44,36 \n\t"
|
||||
#endif
|
||||
"xvadddp 48,48,49 \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 41,48,36 \n\t"
|
||||
#else
|
||||
"xvmaddadp 41,46,36 \n\t"
|
||||
#endif
|
||||
"stxvp 38, 0(%[y]) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"xvmaddadp 40,46,36 \n\t"
|
||||
#else
|
||||
"xvmaddadp 40,48,36 \n\t"
|
||||
#endif
|
||||
"stxvp 40, 32(%[y]) \n\t"
|
||||
|
||||
: [memy] "+m" (*(double (*)[8])y),
|
||||
[n] "+&r" (n),
|
||||
[a0] "=b" (a0),
|
||||
[a1] "=&b" (a1),
|
||||
[a2] "=&b" (a2),
|
||||
[a3] "=&b" (a3),
|
||||
[a4] "=&b" (a4),
|
||||
[a5] "=&b" (a5),
|
||||
[a6] "=&b" (a6),
|
||||
[a7] "=&b" (a7),
|
||||
[off] "+&b" (lda),
|
||||
[off2]"=&b" (off2),
|
||||
[temp] "=&b" (tempR)
|
||||
: [memx] "m" (*(const double (*)[n])x),
|
||||
[mem_ap] "m" (*(const double (*)[n*8]) ap),
|
||||
[alpha] "d" (alpha),
|
||||
"[a0]" (ap),
|
||||
[x] "b" (x),
|
||||
[y] "b" (y)
|
||||
: "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39",
|
||||
"vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
||||
);
|
||||
return;
|
||||
__vector_pair vx, vp;
|
||||
vec_t res[2],res1[2];
|
||||
register __vector double temp0 = {0, 0};
|
||||
register __vector double temp1 = {0, 0};
|
||||
register __vector double temp2 = {0, 0};
|
||||
register __vector double temp3 = {0, 0};
|
||||
register __vector double temp4 = {0, 0};
|
||||
register __vector double temp5 = {0, 0};
|
||||
register __vector double temp6 = {0, 0};
|
||||
register __vector double temp7 = {0, 0};
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
a2 = a1 + lda;
|
||||
a3 = a2 + lda;
|
||||
a4 = a3 + lda;
|
||||
a5 = a4 + lda;
|
||||
a6 = a5 + lda;
|
||||
a7 = a6 + lda;
|
||||
for (i = 0; i < n/2; i += 2) {
|
||||
vp = *((__vector_pair *)((void *)&a0[i*2]));
|
||||
vx = *((__vector_pair *)((void *)&x[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res, &vx);
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0);
|
||||
temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0);
|
||||
vp = *((__vector_pair *)((void *)&a1[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1);
|
||||
temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1);
|
||||
vp = *((__vector_pair *)((void *)&a2[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2);
|
||||
temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2);
|
||||
vp = *((__vector_pair *)((void *)&a3[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3);
|
||||
temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3);
|
||||
vp = *((__vector_pair *)((void *)&a4[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4);
|
||||
temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4);
|
||||
vp = *((__vector_pair *)((void *)&a5[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5);
|
||||
temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5);
|
||||
vp = *((__vector_pair *)((void *)&a6[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6);
|
||||
temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6);
|
||||
vp = *((__vector_pair *)((void *)&a7[i*2]));
|
||||
__builtin_vsx_disassemble_pair (res1, &vp);
|
||||
temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7);
|
||||
temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7);
|
||||
}
|
||||
y[0] += alpha * (temp0[0] + temp0[1]);
|
||||
y[1] += alpha * (temp1[0] + temp1[1]);
|
||||
y[2] += alpha * (temp2[0] + temp2[1]);
|
||||
y[3] += alpha * (temp3[0] + temp3[1]);
|
||||
y[4] += alpha * (temp4[0] + temp4[1]);
|
||||
y[5] += alpha * (temp5[0] + temp5[1]);
|
||||
y[6] += alpha * (temp6[0] + temp6[1]);
|
||||
y[7] += alpha * (temp7[0] + temp7[1]);
|
||||
}
|
||||
#else
|
||||
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
|
||||
|
|
|
|||
|
|
@ -59,10 +59,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
|||
"lxvp 36, 192(%2) \n\t"
|
||||
"lxvp 38, 224(%2) \n\t"
|
||||
|
||||
"stxvp 40, 0(%2) \n\t"
|
||||
"stxvp 42, 32(%2) \n\t"
|
||||
"stxvp 44, 64(%2) \n\t"
|
||||
"stxvp 46, 96(%2) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 40, 0(%2) \n\t"
|
||||
"stxv 41, 16(%2) \n\t"
|
||||
"stxv 42, 32(%2) \n\t"
|
||||
"stxv 43, 48(%2) \n\t"
|
||||
"stxv 44, 64(%2) \n\t"
|
||||
"stxv 45, 80(%2) \n\t"
|
||||
"stxv 46, 96(%2) \n\t"
|
||||
"stxv 47, 112(%2) \n\t"
|
||||
#else
|
||||
"stxv 41, 0(%2) \n\t"
|
||||
"stxv 40, 16(%2) \n\t"
|
||||
"stxv 43, 32(%2) \n\t"
|
||||
"stxv 42, 48(%2) \n\t"
|
||||
"stxv 45, 64(%2) \n\t"
|
||||
"stxv 44, 80(%2) \n\t"
|
||||
"stxv 47, 96(%2) \n\t"
|
||||
"stxv 46, 112(%2) \n\t"
|
||||
#endif
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
|
@ -81,10 +96,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
|
|||
"xvmuldp 46, 38, 48 \n\t"
|
||||
"xvmuldp 47, 39, 48 \n\t"
|
||||
|
||||
"stxvp 40, 0(%2) \n\t"
|
||||
"stxvp 42, 32(%2) \n\t"
|
||||
"stxvp 44, 64(%2) \n\t"
|
||||
"stxvp 46, 96(%2) \n\t"
|
||||
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
"stxv 40, 0(%2) \n\t"
|
||||
"stxv 41, 16(%2) \n\t"
|
||||
"stxv 42, 32(%2) \n\t"
|
||||
"stxv 43, 48(%2) \n\t"
|
||||
"stxv 44, 64(%2) \n\t"
|
||||
"stxv 45, 80(%2) \n\t"
|
||||
"stxv 46, 96(%2) \n\t"
|
||||
"stxv 47, 112(%2) \n\t"
|
||||
#else
|
||||
"stxv 41, 0(%2) \n\t"
|
||||
"stxv 40, 16(%2) \n\t"
|
||||
"stxv 43, 32(%2) \n\t"
|
||||
"stxv 42, 48(%2) \n\t"
|
||||
"stxv 45, 64(%2) \n\t"
|
||||
"stxv 44, 80(%2) \n\t"
|
||||
"stxv 47, 96(%2) \n\t"
|
||||
"stxv 46, 112(%2) \n\t"
|
||||
#endif
|
||||
|
||||
"#n=%1 alpha=%3 x=%0=%2"
|
||||
:
|
||||
|
|
@ -112,10 +142,14 @@ static void dscal_kernel_8_zero (long n, double *x)
|
|||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%2) \n\t"
|
||||
"stxvp 32, 32(%2) \n\t"
|
||||
"stxvp 32, 64(%2) \n\t"
|
||||
"stxvp 32, 96(%2) \n\t"
|
||||
"stxv 32, 0(%2) \n\t"
|
||||
"stxv 32, 16(%2) \n\t"
|
||||
"stxv 32, 32(%2) \n\t"
|
||||
"stxv 32, 48(%2) \n\t"
|
||||
"stxv 32, 64(%2) \n\t"
|
||||
"stxv 32, 80(%2) \n\t"
|
||||
"stxv 32, 96(%2) \n\t"
|
||||
"stxv 32, 112(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
|
|
|
|||
|
|
@ -120,7 +120,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
|||
#if defined(POWER10)
|
||||
if ( n >= 32 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
temp = y[i];
|
||||
y[i] = x[i];
|
||||
|
|
|
|||
|
|
@ -69,6 +69,7 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph
|
|||
|
||||
#endif
|
||||
|
||||
#ifdef SMP
|
||||
// Multi-threading execution outperforms (or approaches) the execution of the
|
||||
// small kernel.
|
||||
if (num_cpu_avail(3) > 1) {
|
||||
|
|
@ -77,6 +78,9 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph
|
|||
} else {
|
||||
return 1;
|
||||
}
|
||||
#else
|
||||
return 1;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
|||
|
|
@ -131,6 +131,10 @@
|
|||
|
||||
#define alpha f27
|
||||
|
||||
#if defined(PPC440)
|
||||
#define PREFETCHSIZE_A (3 * 4)
|
||||
#endif
|
||||
|
||||
#if defined(PPCG4)
|
||||
#define PREFETCHSIZE_A (3 * 4)
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -96,6 +96,11 @@
|
|||
#define X1 r22
|
||||
|
||||
|
||||
#if defined(PPC440)
|
||||
#define PREFETCHSIZE_A 42
|
||||
#define PREFETCHSIZE_C 7
|
||||
#endif
|
||||
|
||||
#if defined(PPCG4)
|
||||
#define PREFETCHSIZE_A 42
|
||||
#define PREFETCHSIZE_C 7
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue