Merge pull request #3717 from xianyi/develop

Update from develop for 0.3.21 release
This commit is contained in:
Martin Kroeker 2022-08-07 22:35:20 +02:00 committed by GitHub
commit 9a34217cc6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8235 changed files with 2182071 additions and 29723 deletions

View File

@ -5,27 +5,20 @@ on: [push, pull_request]
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
fortran: [gfortran, flang]
build: [cmake, make]
exclude:
- os: macos-latest
fortran: flang
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Compilation cache
uses: actions/cache@v2
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ${{ runner.os }}-ccache-${{ github.sha }}
# Restore any ccache cache entry, if none for
# ${{ runner.os }}-ccache-${{ github.sha }} exists
restore-keys: |
${{ runner.os }}-ccache-
uses: actions/checkout@v3
- name: Print system information
run: |
@ -34,7 +27,7 @@ jobs:
elif [ "$RUNNER_OS" == "macOS" ]; then
sysctl -a | grep machdep.cpu
else
echo "$RUNNER_OS not supported"
echo "::error::$RUNNER_OS not supported"
exit 1
fi
@ -43,61 +36,224 @@ jobs:
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get install -y gfortran cmake ccache
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
brew reinstall gcc
brew install coreutils cmake ccache
else
echo "$RUNNER_OS not supported"
echo "::error::$RUNNER_OS not supported"
exit 1
fi
ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB
- name: gfortran build
if: matrix.build == 'make' && matrix.fortran == 'gfortran'
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
# GNU make and cmake call the compilers differently. It looks like
# that causes the cache to mismatch. Keep the ccache for both build
# tools separate to avoid polluting each other.
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
ccache-${{ runner.os }}-${{ matrix.build }}
- name: Configure ccache
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
export PATH="/usr/lib/ccache:${PATH}"
elif [ "$RUNNER_OS" == "macOS" ]; then
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
else
echo "$RUNNER_OS not supported"
exit 1
if [ "${{ matrix.build }}" = "make" ]; then
# Add ccache to path
if [ "$RUNNER_OS" = "Linux" ]; then
echo "/usr/lib/ccache" >> $GITHUB_PATH
elif [ "$RUNNER_OS" = "macOS" ]; then
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
fi
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0
- name: flang build
if: matrix.build == 'make' && matrix.fortran == 'flang'
- name: Build OpenBLAS
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
export PATH="/usr/lib/ccache:${PATH}"
elif [ "$RUNNER_OS" == "macOS" ]; then
exit 0
else
echo "$RUNNER_OS not supported"
exit 1
if [ "${{ matrix.fortran }}" = "flang" ]; then
# download and install classic flang
cd /usr/
sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz
sudo tar xf flang-20190329-x86-70.tgz
sudo rm flang-20190329-x86-70.tgz
cd -
fi
case "${{ matrix.build }}" in
"make")
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
;;
"cmake")
mkdir build && cd build
cmake -DDYNAMIC_ARCH=1 \
-DNOFORTRAN=0 \
-DBUILD_WITHOUT_LAPACK=0 \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
cmake --build .
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac
cd /usr/
sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz
sudo tar xf flang-20190329-x86-70.tgz
sudo rm flang-20190329-x86-70.tgz
cd -
- name: Show ccache status
continue-on-error: true
run: ccache -s
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang
- name: CMake gfortran build
if: matrix.build == 'cmake' && matrix.fortran == 'gfortran'
- name: Run tests
timeout-minutes: 60
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
export PATH="/usr/lib/ccache:${PATH}"
elif [ "$RUNNER_OS" == "macOS" ]; then
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
else
echo "$RUNNER_OS not supported"
exit 1
fi
case "${{ matrix.build }}" in
"make")
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
echo "::group::Tests in 'test' directory"
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'ctest' directory"
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'utest' directory"
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
;;
"cmake")
cd build && ctest
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac
mkdir build
cd build
cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release ..
make -j$(nproc)
msys2:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
msystem: [MINGW64, MINGW32, CLANG64]
idx: [int32, int64]
include:
- msystem: MINGW64
idx: int32
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
- msystem: MINGW32
idx: int32
target-prefix: mingw-w64-i686
fc-pkg: mingw-w64-i686-gcc-fortran
- msystem: CLANG64
idx: int32
target-prefix: mingw-w64-clang-x86_64
c-lapack-flags: -DC_LAPACK=ON
- msystem: MINGW64
idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
- msystem: CLANG64
idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-clang-x86_64
c-lapack-flags: -DC_LAPACK=ON
exclude:
- msystem: MINGW32
idx: int64
defaults:
run:
# Use MSYS2 bash as default shell
shell: msys2 {0}
env:
CHERE_INVOKING: 1
steps:
- name: Get CPU name
shell: pwsh
run : |
Get-CIMInstance -Class Win32_Processor | Select-Object -Property Name
- name: Install build dependencies
uses: msys2/setup-msys2@v2
with:
msystem: ${{ matrix.msystem }}
update: true
release: false # Use pre-installed version
install: >-
base-devel
${{ matrix.target-prefix }}-cc
${{ matrix.fc-pkg }}
${{ matrix.target-prefix }}-cmake
${{ matrix.target-prefix }}-ninja
${{ matrix.target-prefix }}-ccache
- name: Checkout repository
uses: actions/checkout@v3
- name: Compilation cache
uses: actions/cache@v3
with:
# It looks like this path needs to be hard-coded.
path: C:/msys64/home/runneradmin/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }}
# Restore a matching ccache cache entry. Prefer same branch.
restore-keys: |
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}
- name: Configure ccache
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
run: |
which ccache
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 250M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
echo $HOME
cygpath -w $HOME
- name: Configure OpenBLAS
run: |
mkdir build && cd build
cmake -DBUILD_SHARED_LIBS=ON \
-DBUILD_STATIC_LIBS=ON \
-DDYNAMIC_ARCH=ON \
-DUSE_THREAD=ON \
-DNUM_THREADS=64 \
-DTARGET=CORE2 \
${{ matrix.idx64-flags }} \
${{ matrix.c-lapack-flags }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
- name: Build OpenBLAS
run: cd build && cmake --build .
- name: Show ccache status
continue-on-error: true
run: ccache -s
- name: Run tests
timeout-minutes: 60
run: cd build && ctest

View File

@ -25,11 +25,12 @@ matrix:
# - BTYPE="BINARY=64"
#
# - <<: *test-ubuntu
os: linux-ppc64le
os: linux
arch: ppc64le
before_script: &common-before
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
script:
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- travis_wait 20 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@ -43,6 +44,7 @@ matrix:
arch: s390x
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32"
- sudo apt-get install --only-upgrade binutils
env:
# for matrix annotation only
- TARGET_BOX=IBMZ_LINUX
@ -55,6 +57,7 @@ matrix:
compiler: clang
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32"
- sudo apt-get install --only-upgrade binutils
env:
# for matrix annotation only
- TARGET_BOX=IBMZ_LINUX
@ -101,7 +104,7 @@ matrix:
- sudo apt-get update
- sudo apt-get install gcc-9 gfortran-9 -y
script:
- make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- travis_wait 20 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@ -118,7 +121,7 @@ matrix:
- sudo apt-get update
- sudo apt-get install gcc-9 gfortran-9 -y
script:
- make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- travis_wait 20 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@ -269,9 +272,9 @@ matrix:
# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1"
# - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
- &test-graviton2
- &test-neoversen1
os: linux
arch: arm64-graviton2
arch: arm64
dist: focal
group: edge
virt: lxd

View File

@ -1,5 +1,9 @@
Thank you for the support.
### [2019.12/2021.9] [Chan-Zuckerberg Foundation EOSS Initiative](https://chanzuckerberg.com/eoss/)
Between December 2019 and September 2021, the development and maintenance of OpenBLAS were funded in part by the Chan-Zuckerberg Foundation in the context of two grants awarded to the NumPy Foundation and managed by NumFocus (Cycles 1 and 3 of the Essential Open Source Software for Science (EOSS) Initiative of the Chan-Zuckerberg Foundation).
### [2013.8] [Testbed for OpenBLAS project](https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project)
https://www.bountysource.com/fundraisers/443-testbed-for-openblas-project/pledges

View File

@ -17,14 +17,12 @@ include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
if(MSVC AND NOT DEFINED NOFORTRAN)
set(NOFORTRAN ON)
endif()
#######
if(MSVC)
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON)
endif()
option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@ -36,6 +34,8 @@ option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
option(USE_PERL "Use the older PERL scripts for build preparation instead of universal shell scripts" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
@ -179,7 +179,7 @@ endforeach ()
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
if (NOT NOFORTRAN AND NOT NO_LAPACK)
if (NOT NO_LAPACK)
include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake")
if (NOT NO_LAPACKE)
include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake")
@ -205,8 +205,8 @@ endif ()
# add objects to the openblas lib
if(NOT NO_LAPACK)
add_library(LAPACK OBJECT ${LA_SOURCES})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>")
add_library(LAPACK_OVERRIDES OBJECT ${LA_SOURCES})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK_OVERRIDES>")
endif()
if(NOT NO_LAPACKE)
add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
@ -247,7 +247,7 @@ endif()
if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
if (NOT NOFORTRAN)
if (NOT NOFORTRAN)
set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
"sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
@ -314,14 +314,16 @@ endif()
if (NOT NOFORTRAN)
# Build test and ctest
add_subdirectory(test)
if (BUILD_TESTING)
add_subdirectory(lapack-netlib/TESTING)
endif()
endif()
if(NOT NO_CBLAS)
add_subdirectory(ctest)
endif()
add_subdirectory(lapack-netlib/TESTING)
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
add_subdirectory(cpp_thread_test)
endif()
endif()
set_target_properties(${OpenBLAS_LIBS} PROPERTIES
VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
@ -394,14 +396,23 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
if (NOT DEFINED USE_PERL)
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
else()
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
COMMENT "renaming symbols"
)
endif()
endif()
# Install project
# Install libraries

View File

@ -207,3 +207,8 @@ In chronological order:
* Ilya Kurdyukov <https://github.com/ilyakurdyukov>
* [2021-02-21] Add basic support for the Elbrus E2000 architecture
* PLCT Lab, Institute of Software Chinese Academy of Sciences
* [2022-03] Support RISC-V Vector Intrinsic 1.0 version.

View File

@ -1,4 +1,86 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.21
07-Aug-2022
general:
- Updated the included LAPACK to Reference-LAPACK release 3.10.1
- when no Fortran compiler is available, OpenBLAS builds will now automatically
build LAPACK from an f2c-converted copy of LAPACK 3.9.0 unless the NO_LAPACK option
is specified
- similarly added C versions of the BLAS and CBLAS tests
- enabled building of the ReLAPACK GEMMT kernels when ReLAPACK is built
- function LAPACKE_lsame is now annotated with the GCC attribute "const" to aid static analyzers
- added USE_TLS to the list of options reported by the openblas_get_config() function
- CMAKE builds now support the BUILD_TESTING keyword (to disable the LAPACK testsuite) of Reference-LAPACK
- fixed CMAKE builds of the laswp_ncopy and neg_tcopy kernels
- removed the build system requirements for PERL (while keeping the original perl scripts as backup)
- handle building and running OpenBLAS on systems that report zero available cpu cores
- added SYMBOLPREFIX/SYMBOLSUFFIX handling for LAPACK 3.10.0 functions added in 0.3.20
- fixed linking of the utests on QNX
- Added support for compilation with the Intel ifx compiler
- Added support for compilation with the Fujitsu FCC compiler for Fugaku
- Added support for compilation with the Cray C and Fortran compilers
- reverted OpenMP threadpool behaviour in the exec_blas call to its state before 0.3.11, that is,
the threadpool will no longer grow or shrink on demand, as the overhead for this is too big, at least with
GNU OpenMP. The adaptive behaviour introduced in 0.3.11 can still be requested at runtime by setting
the environment variable OMP_ADAPTIVE
- worked around spurious STFSM/CTFSM errors reported by the LAPACK testsuite
x86_64:
- fixed determination of compiler support for AVX512 and removed the 0.3.19
workaround for building SKYLAKEX kernels on Sandybridge hardware
- fixed compilation for the SKYLAKEX target with gcc 6
- fixed compilation of the CooperLake SBGEMM kernel with LLVM
- fixed compilation of the SkyLakeX small matrix GEMM kernels with LLVM or ICC
- fixed compilation of some BFLOAT16 kernels with CMAKE
- added support for the Zhaoxin/Centaur KH40000 cpu
- fixed a potential crash in the ZSYMV kernel used for all targets except generic
- fixed gmake compilation for DYNAMIC_ARCH with a DYNAMIC_LIST including ATOM
- fixed compilation of LAPACKE with the INTEGER64 option on Windows
- added support for cross-compiling to individual Intel or AMD targets using CMAKE
(previously only CORE2 was supported; the added targets are ATOM, PRESCOTT, NEHALEM, SANDYBRIDGE,
HASWELL, SKYLAKEX, COOPERLAKE, SAPPHIRERAPIDS, OPTERON, BARCELONA, BULLDOZER, PILEDRIVER,
STEAMROLLER, EXCAVATOR, ZEN)
SPARC:
- worked around an overflow error in the DNRM2 kernel
POWER:
- worked around an overflow error in the POWER6 DNRM2 kernel
- fixed compilation on PPC440
- fixed a performance regression in the level1 BLAS on POWER10
- fixed the POWER10 ZGEMM kernel
- fixed singlethreaded builds for POWER10
- fixed compilation of the POWER10 DGEMV kernel with older gcc versions
- enabled compilation of the BFLOAT16 kernels by default
- enabled the small matrix kernels by default for DYNAMIC_ARCH builds
- added a workaround for a miscompilation of the CDOT and ZDOT kernels by GCC 12
RISCV:
- fixed cpu autodetection logic
ARMV8:
- added an SBGEMM kernel for Neoverse N2
- worked around an overflow error in the DNRM2 kernel used on M1, NeoverseN1, ThunderX2T99
- added support for ARM64 systems running MS Windows
- added support for cross-compiling to the GENERIC ARMV8 target under CMAKE (Windows/MSVC)
- fixed a performance regression in the generic ARMV8 DGEMM kernel introduced in 0.3.19
- added initial support for the Apple M1 cpu under Linux
- added initial support for the Phytium FT2000 cpu
- added initial support for the Cortex A510, A710, X1 and X2 cpus
- fixed an accidental mixup of cpu identifiers in the autodetection code introduced in 0.3.20
- fixed linking of Apple M1 builds on macOS 12 and later with recent XCode
- made Neoverse N2 available in DYNAMIC_ARCH builds
MIPS,MIPS64:
- worked around an overflow error in the DNRM2 kernel
LOONGARCH64:
- worked around an overflow error in the DNRM2 kernel
- added preliminary support for the LOONGSON2K1000 cpu
- added DYNAMIC_ARCH support
====================================================================
Version 0.3.20
20-Feb-2022

View File

@ -25,11 +25,14 @@ ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
ifneq ($(NO_LAPACK), 1)
define C_LAPACK
1
endef
endif
export NOFORTRAN
export NO_LAPACK
export C_LAPACK
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
@ -146,21 +149,25 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
ifndef NO_FBLAS
$(MAKE) -C test all
endif
endif
ifneq ($(ONLY_CBLAS), 1)
$(MAKE) -C utest all
endif
ifneq ($(NO_CBLAS), 1)
ifneq ($(ONLY_CBLAS), 1)
$(MAKE) -C ctest all
endif
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
endif
endif
endif
libs :
ifeq ($(CORE), UNKNOWN)
$(error OpenBLAS: Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for the detail.)
endif
ifeq ($(NOFORTRAN), 1)
$(info OpenBLAS: Detecting fortran compiler failed. Cannot compile LAPACK. Only compile BLAS.)
$(info OpenBLAS: Detecting fortran compiler failed. Can only compile BLAS and f2c-converted LAPACK.)
endif
ifeq ($(NO_STATIC), 1)
ifeq ($(NO_SHARED), 1)
@ -241,19 +248,14 @@ hpl_p :
fi; \
done
ifeq ($(NO_LAPACK), 1)
netlib :
else
netlib : lapack_prebuild
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
ifneq ($(NO_LAPACK), 1)
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
ifneq ($(NO_LAPACKE), 1)
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
endif
endif
ifeq ($(NO_LAPACK), 1)
re_lapack :
@ -267,7 +269,7 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
lapack_prebuild :
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK)))
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc

View File

@ -3,6 +3,9 @@ ifneq ($(C_COMPILER), PGI)
ifeq ($(C_COMPILER), CLANG)
ISCLANG=1
endif
ifeq ($(C_COMPILER), FUJITSU)
ISCLANG=1
endif
ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG)))
CCOMMON_OPT += -march=armv8-a
ifneq ($(F_COMPILER), NAG)
@ -55,6 +58,13 @@ FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
endif
ifeq ($(CORE), FT2000)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
# Use a72 tunings because Neoverse-N1 is only available
# in GCC>=9
ifeq ($(CORE), NEOVERSEN1)
@ -114,9 +124,9 @@ ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
endif
else
CCOMMON_OPT += -march=armv8.5-a -mtune=native
@ -229,6 +239,43 @@ endif
endif
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXX1)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortexa72
endif
endif
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXX2)
CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
endif
endif
#ifeq (1, $(filter 1,$(ISCLANG)))
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXA510)
CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
endif
endif
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG)))
ifeq ($(CORE), CORTEXA710)
CCOMMON_OPT += -march=armv8.4-a+sve
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a+sve
endif
endif
endif
endif
endif

View File

@ -15,6 +15,12 @@ TARGET_MAKE = Makefile.conf
TARGET_CONF = config.h
endif
ifdef USE_PERL
SCRIPTSUFFIX = .pl
else
SCRIPTSUFFIX =
endif
# CPUIDEMU = ../../cpuid/table.o
ifdef CPUIDEMU
@ -46,17 +52,17 @@ TARGET_FLAGS = -mips64r6
endif
ifeq ($(TARGET), C910V)
TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v
TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
endif
all: getarch_2nd
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)
config.h : c_check f_check getarch
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
$(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch
./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
ifneq ($(ONLY_CBLAS), 1)
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
else
#When we only build CBLAS, we set NOFORTRAN=2
echo "NOFORTRAN=2" >> $(TARGET_MAKE)
@ -71,9 +77,11 @@ endif
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
avx512=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \
rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
getarch_2nd : getarch_2nd.c config.h dummy
getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy
ifndef TARGET_CORE
$(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c
else
@ -81,3 +89,5 @@ else
endif
dummy:
.PHONY: dummy

View File

@ -1,4 +1,4 @@
ifeq ($(CORE), C910V)
CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v
FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static
CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
endif

View File

@ -261,8 +261,9 @@ endif
#For small matrix optimization
ifeq ($(ARCH), x86_64)
SMALL_MATRIX_OPT = 1
else ifeq ($(CORE), POWER10)
else ifeq ($(ARCH), power)
SMALL_MATRIX_OPT = 1
BUILD_BFLOAT16 = 1
endif
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
@ -352,7 +353,7 @@ OBJCONV = $(CROSS_SUFFIX)objconv
# When fortran support was either not detected or actively deselected, only build BLAS.
ifeq ($(NOFORTRAN), 1)
NO_LAPACK = 1
C_LAPACK = 1
override FEXTRALIB =
endif
@ -384,8 +385,12 @@ endif
ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET
ifeq ($(ARCH), arm64)
export MACOSX_DEPLOYMENT_TARGET=11.0
else
export MACOSX_DEPLOYMENT_TARGET=10.8
endif
endif
MD5SUM = md5 -r
endif
@ -675,6 +680,10 @@ ifeq ($(ARCH), mips64)
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
endif
ifeq ($(ARCH), loongarch64)
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
endif
ifeq ($(ARCH), zarch)
DYNAMIC_CORE = ZARCH_GENERIC
@ -847,7 +856,7 @@ CCOMMON_OPT += -mabi=32
BINARY_DEFINED = 1
endif
ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
CCOMMON_OPT += -march=loongson3a
FCOMMON_OPT += -march=loongson3a
endif
@ -887,11 +896,9 @@ BINARY_DEFINED = 1
endif
ifeq ($(ARCH), loongarch64)
ifeq ($(CORE), LOONGSON3R5)
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
endif
endif
endif
@ -1041,9 +1048,13 @@ FCOMMON_OPT += -frecursive
# work around ABI problem with passing single-character arguments
FCOMMON_OPT += -fno-optimize-sibling-calls
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NOFORTRAN), 1)
ifneq ($(NOFORTRAN), 2)
ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
endif
endif
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
@ -1179,7 +1190,6 @@ FCOMMON_OPT += -i8
endif
endif
endif
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
FCOMMON_OPT += -n32
@ -1189,11 +1199,9 @@ endif
ifeq ($(CORE), LOONGSON3R3)
FCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3R4)
FCOMMON_OPT += -loongson3 -static
endif
else
ifndef BINARY64
FCOMMON_OPT += -m32
@ -1201,7 +1209,6 @@ else
FCOMMON_OPT += -m64
endif
endif
ifeq ($(USE_OPENMP), 1)
FEXTRALIB += -lstdc++
FCOMMON_OPT += -mp
@ -1209,7 +1216,6 @@ endif
endif
ifeq ($(C_COMPILER), OPEN64)
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
CCOMMON_OPT += -n32
@ -1219,13 +1225,10 @@ endif
ifeq ($(CORE), LOONGSON3R3)
CCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3R4)
CCOMMON_OPT += -loongson3 -static
endif
else
ifndef BINARY64
CCOMMON_OPT += -m32
else
@ -1271,6 +1274,19 @@ FCOMMON_OPT += -openmp
endif
endif
ifeq ($(F_COMPILER), CRAY)
CCOMMON_OPT += -DF_INTERFACE_INTEL
FCOMMON_OPT += -hnopattern
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -s integer64
endif
endif
ifneq ($(USE_OPENMP), 1)
FCOMMON_OPT += -O noomp
endif
endif
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@ -1303,6 +1319,10 @@ ifeq ($(DYNAMIC_OLDER), 1)
CCOMMON_OPT += -DDYNAMIC_OLDER
endif
ifeq ($(C_LAPACK), 1)
CCOMMON_OPT += -DC_LAPACK
endif
ifeq ($(NO_LAPACK), 1)
CCOMMON_OPT += -DNO_LAPACK
#Disable LAPACK C interface
@ -1532,7 +1552,7 @@ override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
ifdef NEED_PIC
ifeq ($(NEED_PIC), 1)
ifeq (,$(findstring PIC,$(FFLAGS)))
override FFLAGS += -fPIC
endif
@ -1550,6 +1570,11 @@ endif
ifeq ($(F_COMPILER),NAG)
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif
ifeq ($(F_COMPILER),CRAY)
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif
LAPACK_CFLAGS = $(CFLAGS)
@ -1562,6 +1587,7 @@ endif
ifdef OS_WINDOWS
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
endif
ifeq ($(C_COMPILER), LSB)
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
@ -1661,6 +1687,7 @@ export USE_OPENMP
export CROSS
export CROSS_SUFFIX
export NOFORTRAN
export C_LAPACK
export NO_FBLAS
export EXTRALIB
export CEXTRALIB

View File

@ -92,6 +92,10 @@ CORTEXA53
CORTEXA57
CORTEXA72
CORTEXA73
CORTEXA510
CORTEXA710
CORTEXX1
CORTEXX2
NEOVERSEN1
NEOVERSEV1
NEOVERSEN2
@ -103,6 +107,9 @@ THUNDERX2T99
TSV110
THUNDERX3T110
VORTEX
A64FX
ARMV8SVE
FT2000
9.System Z:
ZARCH_GENERIC
@ -114,7 +121,9 @@ RISCV64_GENERIC
C910V
11.LOONGARCH64:
LOONGSONGENERIC
LOONGSON3R5
LOONGSON2K1000
12. Elbrus E2000:
E2K

View File

@ -65,7 +65,7 @@ jobs:
- task: CMake@1
inputs:
workingDirectory: 'build' # Optional
cmakeArgs: '-G "Visual Studio 16 2019" ..'
cmakeArgs: '-G "Visual Studio 17 2022" ..'
- task: CMake@1
inputs:
cmakeArgs: '--build . --config Release'
@ -81,7 +81,7 @@ jobs:
vmImage: 'windows-latest'
steps:
- script: |
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL"
mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="SANDYBRIDGE"
- job: Windows_clang_cmake
pool:
@ -103,7 +103,7 @@ jobs:
- job: Windows_flang_clang
pool:
vmImage: 'windows-latest'
vmImage: 'windows-2022'
steps:
- script: |
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
@ -114,11 +114,31 @@ jobs:
conda install --yes --quiet ninja flang
mkdir build
cd build
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake --build . --config Release
ctest
- job: Windows_cl_flang
pool:
vmImage: 'windows-2022'
steps:
- script: |
set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%"
set "LIB=C:\Miniconda\Library\lib;%LIB%"
set "CPATH=C:\Miniconda\Library\include;%CPATH%"
conda config --add channels conda-forge --force
conda config --set auto_update_conda false
conda install --yes --quiet ninja flang
mkdir build
cd build
call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
cmake -G "Ninja" -DCMAKE_C_COMPILER=cl -DCMAKE_Fortran_COMPILER=flang -DC_LAPACK=1 -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
cmake --build . --config Release
ctest
- job: OSX_OpenMP
pool:
vmImage: 'macOS-10.15'
@ -143,11 +163,12 @@ jobs:
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
MACOSX_DEPLOYMENT_TARGET: 11.0
steps:
- script: |
brew update
brew install llvm libomp
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
make TARGET=CORE2 USE_OPENMP=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang NOFORTRAN=1
- job: OSX_OpenMP_Clang_cmake
pool:
@ -178,7 +199,7 @@ jobs:
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
cmake --build .
ctest
- job: OSX_Ifort_Clang
pool:
vmImage: 'macOS-10.15'

767
c_check Normal file → Executable file
View File

@ -1,426 +1,415 @@
#!/usr/bin/env perl
#use File::Basename;
# use File::Temp qw(tempfile);
#!/bin/sh
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
chop($hostarch);
$hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
# Identify the build host. hostos is `uname -s` with everything from the
# first '-' onwards removed; hostarch starts as `uname -m` with i?86
# rewritten to x86.
hostos=`uname -s | sed -e 's/\-.*//'`
hostarch=`uname -m | sed -e 's/i.86/x86/'`
# On AIX and SunOS the architecture is taken from `uname -p` instead.
if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then
hostarch=`uname -p`
fi
# Normalise the host architecture to the names used for $architecture
# later in this script (x86_64, arm, arm64, power, zarch).
case "$hostarch" in
amd64) hostarch=x86_64 ;;
# Any arm* name other than exactly "arm64" collapses to plain "arm".
arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;;
aarch64) hostarch=arm64 ;;
powerpc*|ppc*) hostarch=power ;;
s390x) hostarch=zarch ;;
esac
#$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
makefile="$1"
config="$2"
$makefile = shift(@ARGV);
$config = shift(@ARGV);
$compiler_name = shift(@ARGV);
$flags = join(" ", @ARGV);
compiler_name="$3"
shift 3
flags="$*"
# First, we need to know the target OS and compiler name
$data = `$compiler_name $flags -E ctest.c`;
if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
die 1;
{
data=`$compiler_name $flags -E ctest.c`
} || {
printf '%s\n' "C Compiler ($compiler_name) is something wrong." >&2
exit 1
}
$cross_suffix = "";
cross_suffix=""
eval "use File::Basename";
if ($@){
warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if [ "`dirname $compiler_name`" != '.' ]; then
cross_suffix="$cross_suffix`dirname $compiler_name`/"
fi
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
}
bn=`basename $compiler_name`
case "$bn" in
*-*) cross_suffix="$cross_suffix${bn%-*}-"
esac
$compiler = "";
$compiler = LSB if ($data =~ /COMPILER_LSB/);
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
$compiler = PGI if ($data =~ /COMPILER_PGI/);
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
$compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/);
$compiler = SUN if ($data =~ /COMPILER_SUN/);
$compiler = IBM if ($data =~ /COMPILER_IBM/);
$compiler = DEC if ($data =~ /COMPILER_DEC/);
$compiler = GCC if ($compiler eq "");
compiler=""
case "$data" in
*COMPILER_LSB*) compiler=LSB ;;
*COMPILER_CLANG*) compiler=CLANG ;;
*COMPILER_PGI*) compiler=PGI ;;
*COMPILER_PATHSCALE*) compiler=PATHSCALE ;;
*COMPILER_INTEL*) compiler=INTEL ;;
*COMPILER_OPEN64*) compiler=OPEN64 ;;
*COMPILER_SUN*) compiler=SUN ;;
*COMPILER_IBM*) compiler=IBM ;;
*COMPILER_DEC*) compiler=DEC ;;
*COMPILER_FUJITSU*) compiler=FUJITSU ;;
esac
if [ -z "$compiler" ]; then
compiler=GCC
fi
$os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NETBSD/);
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
$os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/);
$os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
case "$data" in *OS_LINUX*) os=Linux ;; esac
case "$data" in *OS_FREEBSD*) os=FreeBSD ;; esac
case "$data" in *OS_NETBSD*) os=NetBSD ;; esac
case "$data" in *OS_OPENBSD*) os=OpenBSD ;; esac
case "$data" in *OS_DRAGONFLY*) os=DragonFly ;; esac
case "$data" in *OS_DARWIN*) os=Darwin ;; esac
case "$data" in *OS_SUNOS*) os=SunOS ;; esac
case "$data" in *OS_AIX*) os=AIX ;; esac
case "$data" in *OS_OSF*) os=osf ;; esac
case "$data" in *OS_WINNT*) os=WINNT ;; esac
case "$data" in *OS_CYGWIN_NT*) os=CYGWIN_NT ;; esac
case "$data" in *OS_INTERIX*) os=Interix ;; esac
case "$data" in *OS_ANDROID*) os=Android ;; esac
case "$data" in *OS_HAIKU*) os=Haiku ;; esac
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
case "$data" in
*ARCH_X86_64*) architecture=x86_64 ;;
*ARCH_X86*) architecture=x86 ;;
*ARCH_E2K*) architecture=e2k ;;
*ARCH_POWER*) architecture=power ;;
*ARCH_MIPS64*) architecture=mips64 ;;
*ARCH_MIPS*) architecture=mips ;;
*ARCH_ALPHA*) architecture=alpha ;;
*ARCH_SPARC*) architecture=sparc ;;
*ARCH_IA64*) architecture=ia64 ;;
*ARCH_ARM64*) architecture=arm64 ;;
*ARCH_ARM*) architecture=arm ;;
*ARCH_ZARCH*) architecture=zarch ;;
*ARCH_RISCV64*) architecture=riscv64 ;;
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
esac
$defined = 0;
defined=0
if ($os eq "AIX") {
$compiler_name .= " -maix32" if ($binary eq "32");
$compiler_name .= " -maix64" if ($binary eq "64");
$defined = 1;
}
if [ "$os" = "AIX" ]; then
case "$BINARY" in
32) compiler_name="$compiler_name -maix32" ;;
64) compiler_name="$compiler_name -maix64" ;;
esac
defined=1
fi
if ($architecture eq "mips") {
$compiler_name .= " -mabi=32";
$defined = 1;
}
# Select ABI flags for the detected target architecture.
# defined=1 records that the word size has been dealt with here, so the
# generic -m32/-m64 fallback guarded by [ "$defined" -eq 0 ] is skipped.
case "$architecture" in
    mips)
        compiler_name="$compiler_name -mabi=32"
        defined=1
        ;;
    mips64)
        # BINARY (from the environment) picks between the n32 and 64 ABIs.
        case "$BINARY" in
            32) compiler_name="$compiler_name -mabi=n32" ;;
            64) compiler_name="$compiler_name -mabi=64" ;;
        esac
        defined=1
        ;;
    arm|arm64) defined=1 ;;
    # Targets that are always built as 64-bit. The last pattern used to be
    # misspelled "loonarch64", so loongarch64 never matched this branch.
    zarch|e2k|alpha|ia64|riscv64|loongarch64)
        defined=1
        BINARY=64
        ;;
    x86)
        # Force 32-bit on x86 except on Darwin and SunOS.
        [ "$os" != "Darwin" ] && [ "$os" != "SunOS" ] && {
            defined=1
            BINARY=32
        }
        ;;
esac
if ($architecture eq "mips64") {
$compiler_name .= " -mabi=n32" if ($binary eq "32");
$compiler_name .= " -mabi=64" if ($binary eq "64");
$defined = 1;
}
case "$compiler" in
PGI)
case "$BINARY" in
32) compiler_name="$compiler_name -tp p7" ;;
64) compiler_name="$compiler_name -tp p7-64" ;;
esac
openmp='-mp'
defined=1
;;
IBM)
case "$BINARY" in
32) compiler_name="$compiler_name -q32" ;;
64) compiler_name="$compiler_name -q64" ;;
esac
openmp='-qsmp=omp'
defined=1
;;
INTEL) openmp='-openmp' ;;
PATHSCALE|OPEN64) openmp='-mp' ;;
CLANG|GCC|LSB) openmp='-fopenmp' ;;
FUJITSU) openmp='-Kopenmp' ;;
esac
if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
if ($architecture eq "zarch") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "e2k") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "ia64") {
$defined = 1;
$binary = 64;
}
if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) {
$defined = 1;
$binary =32;
}
if ($architecture eq "riscv64") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "loongarch64") {
$defined = 1;
$binary = 64;
}
if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64");
$openmp = "-mp";
$defined = 1;
}
if ($compiler eq "IBM") {
$compiler_name .= " -q32" if ($binary eq "32");
$compiler_name .= " -q64" if ($binary eq "64");
$openmp = "-qsmp=omp";
$defined = 1;
}
if ($compiler eq "INTEL") {
$openmp = "-openmp";
}
if ($compiler eq "PATHSCALE") {
$openmp = "-mp";
}
if ($compiler eq "OPEN64") {
$openmp = "-mp";
}
if ($compiler eq "CLANG") {
$openmp = "-fopenmp";
}
if ($compiler eq "GCC" || $compiler eq "LSB") {
$openmp = "-fopenmp";
}
if ($defined == 0) {
$compiler_name .= " -m32" if ($binary eq "32");
$compiler_name .= " -m64" if ($binary eq "64");
}
if [ "$defined" -eq 0 ]; then
case "$BINARY" in
32) compiler_name="$compiler_name -m32" ;;
64) compiler_name="$compiler_name -m64" ;;
esac
fi
# Do again
$data = `$compiler_name $flags -E ctest.c`;
if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
die 1;
}
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
}
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
if ($compiler eq "PGI") {
$args = " -tp skylake -c -o $tmpf.o $tmpf";
}
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("$tmpf.o");
}
}
$c11_atomics = 0;
if ($data =~ /HAVE_C11/) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11";
$c11_atomics = 0;
} else {
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
print $tmpf "#include <stdatomic.h>\nint main(void){}\n";
$args = " -c -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$c11_atomics = 0;
} else {
$c11_atomics = 1;
}
unlink("$tmpf.o");
}
}
if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) {
$no_avx2 = 0;
$oldgcc = 0;
$data = `$compiler_name -dumpversion`;
if ($data <= 4.6) {
$no_avx2 = 1;
$oldgcc = 1;
}
}
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;
$cross = 0;
if ($architecture ne $hostarch) {
$cross = 1;
$cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86"));
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
}
$cross = 1 if ($os ne $hostos);
$openmp = "" if $ENV{USE_OPENMP} != 1;
$linker_L = "";
$linker_l = "";
$linker_a = "";
{
$link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
$link =~ s/\-Y\sP\,/\-Y/g;
@flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
foreach $flags (@flags) {
if (
($flags =~ /^\-L/)
&& ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/)
) {
$linker_L .= $flags . " "
}
if ($flags =~ /^\-Y/) {
$linker_L .= "-Wl,". $flags . " "
}
if ($flags =~ /^\--exclude-libs/) {
$linker_L .= "-Wl,". $flags . " ";
$flags="";
}
if (
($flags =~ /^\-l/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
&& ($flags !~ /numa/)
&& ($flags !~ /crt[0-9]/)
&& ($flags !~ /gcc/)
&& ($flags !~ /user32/)
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /[0-9]+/)
) {
$linker_l .= $flags . " "
}
$linker_a .= $flags . " " if $flags =~ /\.a$/;
}
data="$($compiler_name $flags -E ctest.c)"
} || {
printf '%s\n' "C Compiler ($compiler_name) is something wrong." >&2
exit 1
}
open(MAKEFILE, "> $makefile") || die "Can't create $makefile";
open(CONFFILE, "> $config" ) || die "Can't create $config";
have_msa=0
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
tmpd="$(mktemp -d)"
tmpf="$tmpd/a.c"
code='"addvi.b $w0, $w1, 1"'
msa_flags='-mmsa -mfp64 -mload-store-pairs'
printf "#include <msa.h>\n\n" >> "$tmpf"
printf "void main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
args="$msa_flags -o $tmpf.o $tmpf"
have_msa=1
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
have_msa=0
}
rm -rf "$tmpd"
fi
case "$data" in
*ARCH_X86_64*) architecture=x86_64 ;;
*ARCH_X86*) architecture=x86 ;;
*ARCH_E2K*) architecture=e2k ;;
*ARCH_POWER*) architecture=power ;;
*ARCH_MIPS64*) architecture=mips64 ;;
*ARCH_MIPS*) architecture=mips ;;
*ARCH_ALPHA*) architecture=alpha ;;
*ARCH_SPARC*) architecture=sparc ;;
*ARCH_IA64*) architecture=ia64 ;;
*ARCH_ARM64*) architecture=arm64 ;;
*ARCH_ARM*) architecture=arm ;;
*ARCH_ZARCH*) architecture=zarch ;;
*ARCH_LOONGARCH64*) architecture=loongarch64 ;;
esac
binformat='bin32'
case "$data" in
*BINARY_64*) binformat='bin64' ;;
esac
no_avx512=0
if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
tmpd=`mktemp -d`
tmpf="$tmpd/a.c"
code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
if [ "$compiler" = "PGI" ]; then
args=" -tp skylake -c -o $tmpf.o $tmpf"
else
args=" -march=skylake-avx512 -c -o $tmpf.o $tmpf"
fi
no_avx512=0
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
no_avx512=1
}
rm -rf "$tmpd"
fi
no_rv64gv=0
if [ "$architecture" = "riscv64" ]; then
tmpd=`mktemp -d`
tmpf="$tmpd/a.c"
code='"vsetvli zero, zero, e8, m1\n"'
printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
args=" -march=rv64gv -c -o $tmpf.o $tmpf"
no_rv64gv=0
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
no_rv64gv=1
}
rm -rf "$tmpd"
fi
c11_atomics=0
case "$data" in
*HAVE_C11*)
tmpd=`mktemp -d`
tmpf="$tmpd/a.c"
printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
args=" -c -o $tmpf.o $tmpf"
c11_atomics=1
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
c11_atomics=0
}
rm -rf "$tmpd"
;;
esac
oldgcc=0
no_avx2=0
if [ "$compiler" = "GCC" ]; then
case "$architecture" in x86|x86_64)
no_avx2=0
oldgcc=0
data=`$compiler_name -dumpversion`
case "$data" in *.*.*)
data="${data%.*}"
esac
if awk -v n1=$data -v n2=4.6 'BEGIN { exit !(n1 <= n2) }'; then
no_avx2=1
oldgcc=1
fi
esac
fi
# Compile ctest1.c to assembly and keep the first line mentioning "globl";
# the generated ctest1.s is removed afterwards.
data=`$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`
# need_fu becomes the run of leading '_' / '.' characters that follows
# "globl " in that line, or stays empty when there is none.
need_fu=''
# grep is used only as a test here: its output is discarded so the matched
# line does not leak onto stdout, which carries this script's results when
# the makefile argument is "-".
if echo "$data" | grep 'globl[[:space:]][_\.]' >/dev/null; then
    # Drop everything up to and including "globl" plus one whitespace char.
    need_fu="${data##*globl[[:space:]]}"
    # Cut from the first character that is neither '_' nor '.'.
    need_fu="${need_fu%%[!_\.]*}"
fi
# Decide whether this is a cross build by comparing the compiler's target
# ($architecture, $os) with the build host ($hostarch, $hostos).
cross=0
if [ "$architecture" != "$hostarch" ]; then
cross=1
# A 32-bit target on the matching 64-bit host is not treated as cross.
[ "$hostarch" = "x86_64" ] && [ "$architecture" = "x86" ] && cross=0
[ "$hostarch" = "mips64" ] && [ "$architecture" = "mips" ] && cross=0
fi
# A differing OS always means cross ...
[ "$os" != "$hostos" ] && cross=1
# ... except an Android target on a Linux host when TERMUX_APP_PID is set
# in the environment.
[ "$os" = "Android" ] && [ "$hostos" = "Linux" ] && [ -n "$TERMUX_APP_PID" ] \
&& cross=0
# The OpenMP flag chosen earlier is dropped unless USE_OPENMP is exactly 1.
[ "$USE_OPENMP" != 1 ] && openmp=''
linker_L=""
linker_l=""
linker_a=""
link=`$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`
link=`echo "$link" | sed 's/\-Y[[:space:]]P\,/\-Y/g'`
flags=`echo $link | tr "'[[:space:]],\n" " "`
# Strip trailing quotes
old_flags="$flags"
flags=''
for flag in $old_flags; do
f=`echo "$flag" | tr '"' ' '`
flags="$flags $f"
done
# Sort each token of the verbose link output into one of three lists:
#   linker_L - library search paths and options passed through -Wl,
#   linker_l - -l<lib> options, minus runtime/system libraries
#   linker_a - static archives (tokens ending in .a)
for flag in $flags; do
    # -L<dir> search paths; -LIST: and -LANG: are not paths and are skipped.
    case "$flag" in -L*)
        case "$flag" in
            -LIST:*|-LANG:*) ;;
            *) linker_L="$linker_L $flag" ;;
        esac
    esac

    case "$flag" in -Y*)
        linker_L="$linker_L -Wl,$flag" ;;
    esac

    case "$flag" in --exclude-libs*)
        linker_L="$linker_L -Wl,$flag"
        # Clear the current token so the checks below cannot file it a
        # second time (e.g. as a .a archive). This used to assign to
        # "flags", which has no effect on the running loop; the Perl
        # version clears its loop variable at this point.
        flag=""
        ;;
    esac

    # -l<lib> options, excluding startup objects, compiler runtime and
    # Windows system libraries, OpenMP, and any name containing a digit.
    case "$flag" in -l*)
        case "$flag" in
            *gfortranbegin*|*frtbegin*|*pathfstart*|*numa*|*crt[0-9]*|\
            *gcc*|*user32*|*kernel32*|*advapi32*|*shell32*|*omp*|\
            *[0-9]*) ;;
            *) linker_l="$linker_l $flag" ;;
        esac
    esac

    case "$flag" in *.a) linker_a="$linker_a $flag" ;; esac
done
[ "$makefile" = "-" ] && {
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
exit 0
}
:> "$makefile" || exit 1
:> "$config" || exit 1
# print $data, "\n";
print MAKEFILE "OSNAME=$os\n";
print MAKEFILE "ARCH=$architecture\n";
print MAKEFILE "C_COMPILER=$compiler\n";
print MAKEFILE "BINARY32=\n" if $binformat ne bin32;
print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
{
printf "OSNAME=%s\n" "$os"
printf "ARCH=%s\n" "$architecture"
printf "C_COMPILER=%s\n" "$compiler"
[ $binformat != 'bin32' ] && printf "BINARY32=\n"
[ $binformat != 'bin64' ] && printf "BINARY64=\n"
[ "$binformat" = "bin32" ] && printf "BINARY32=1\n"
[ "$binformat" = "bin64" ] && printf "BINARY64=1\n"
[ -n "$need_fu" ] && printf 'FU=%s\n' "$need_fu"
[ "$cross" -ne 0 ] && [ -n "$cross_suffix" ] && \
printf "CROSS_SUFFIX=%s\n" "$cross_suffix"
[ "$cross" -ne 0 ] && printf "CROSS=1\n"
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a"
[ "$have_msa" -eq 1 ] && {
printf "HAVE_MSA=1\n"
printf "MSA_FLAGS=%s\n" "$msa_flags"
}
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
} >> "$makefile"
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
$compiler =~ tr/[a-z]/[A-Z]/;
# Upper-case the OS, architecture and compiler names for use in the
# OS_*, ARCH_* and C_* macro names written to the config header.
# (The os line used to carry a stray "/" glued onto the second tr set.)
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]' `
architecture=`echo "$architecture" | tr '[[:lower:]]' '[[:upper:]]' `
compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
print CONFFILE "#define OS_$os\t1\n";
print CONFFILE "#define ARCH_$architecture\t1\n";
print CONFFILE "#define C_$compiler\t1\n";
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
{
printf "#define OS_%s\t1\n" "$os"
printf "#define ARCH_%s\t1\n" "$architecture"
printf "#define C_%s\t1\n" "$compiler"
[ "$binformat" = "bin32" ] && printf "#define __32BIT__\t1\n"
[ "$binformat" = "bin64" ] && printf "#define __64BIT__\t1\n"
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
[ "$have_msa" -eq 1 ] && printf "#define HAVE_MSA\t1\n"
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
} >> "$config"
if ($os eq "LINUX") {
if [ "$os" = "LINUX" ]; then
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
# if ($pthread[2] ne "") {
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
# } else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
printf "#define PTHREAD_CREATE_FUNC pthread_create\n" >> "$config"
# }
} else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
}
close(MAKEFILE);
close(CONFFILE);
else
printf "#define PTHREAD_CREATE_FUNC pthread_create\n" >> "$config"
fi

456
c_check.pl Normal file
View File

@ -0,0 +1,456 @@
#!/usr/bin/env perl
#use File::Basename;
# use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
chop($hostarch);
$hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
#$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
$config = shift(@ARGV);
$compiler_name = shift(@ARGV);
$flags = join(" ", @ARGV);
# First, we need to know the target OS and compiler name
$data = `$compiler_name $flags -E ctest.c`;
if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
die 1;
}
# Build $cross_suffix from the compiler name: its directory part (unless
# that is ".") followed by the part of the file name up to and including
# its last '-'.
$cross_suffix = "";

eval "use File::Basename";
if ($@){
    warn "could not load PERL module File::Basename, emulating its functionality";
    # Emulated dirname(): everything before the last '/'. The previous
    # code took rindex()-1 characters, which cut off the last character of
    # the directory and, for a name without any '/', produced a bogus
    # directory from the compiler name itself.
    my $slash = rindex($compiler_name, "/");
    my $dirnam = ".";               # no '/' at all: current directory
    if ($slash > 0) {
        $dirnam = substr($compiler_name, 0, $slash);
    } elsif ($slash == 0) {
        $dirnam = "/";              # compiler sits directly under the root
    }
    if ($dirnam ne ".") {
        $cross_suffix .= $dirnam . "/";
    }
    # Emulated basename(): everything after the last '/' (the whole name
    # when there is none, since $slash is then -1).
    my $basnam = substr($compiler_name, $slash + 1);
    if ($basnam =~ /([^\s]*-)(.*)/) {
        $cross_suffix .= $1;
    }
} else {
    if (dirname($compiler_name) ne ".") {
        $cross_suffix .= dirname($compiler_name) . "/";
    }
    if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
        $cross_suffix .= $1;
    }
}
$compiler = "";
$compiler = LSB if ($data =~ /COMPILER_LSB/);
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
$compiler = PGI if ($data =~ /COMPILER_PGI/);
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
$compiler = OPEN64 if ($data =~ /COMPILER_OPEN64/);
$compiler = SUN if ($data =~ /COMPILER_SUN/);
$compiler = IBM if ($data =~ /COMPILER_IBM/);
$compiler = DEC if ($data =~ /COMPILER_DEC/);
$compiler = FUJITSU if ($data =~ /COMPILER_FUJITSU/);
$compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NETBSD/);
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
$os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/);
$os = osf if ($data =~ /OS_OSF/);
$os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$defined = 0;
if ($os eq "AIX") {
$compiler_name .= " -maix32" if ($binary eq "32");
$compiler_name .= " -maix64" if ($binary eq "64");
$defined = 1;
}
if ($architecture eq "mips") {
$compiler_name .= " -mabi=32";
$defined = 1;
}
if ($architecture eq "mips64") {
$compiler_name .= " -mabi=n32" if ($binary eq "32");
$compiler_name .= " -mabi=64" if ($binary eq "64");
$defined = 1;
}
if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
if ($architecture eq "zarch") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "e2k") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "ia64") {
$defined = 1;
$binary = 64;
}
if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) {
$defined = 1;
$binary =32;
}
if ($architecture eq "riscv64") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "loongarch64") {
$defined = 1;
$binary = 64;
}
if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64");
$openmp = "-mp";
$defined = 1;
}
if ($compiler eq "IBM") {
$compiler_name .= " -q32" if ($binary eq "32");
$compiler_name .= " -q64" if ($binary eq "64");
$openmp = "-qsmp=omp";
$defined = 1;
}
if ($compiler eq "INTEL") {
$openmp = "-openmp";
}
if ($compiler eq "PATHSCALE") {
$openmp = "-mp";
}
if ($compiler eq "OPEN64") {
$openmp = "-mp";
}
if ($compiler eq "CLANG") {
$openmp = "-fopenmp";
}
if ($compiler eq "GCC" || $compiler eq "LSB") {
$openmp = "-fopenmp";
}
if ($compiler eq "FUJITSU") {
$openmp = "-Kopenmp";
}
if ($defined == 0) {
$compiler_name .= " -m32" if ($binary eq "32");
$compiler_name .= " -m64" if ($binary eq "64");
}
# Do again
$data = `$compiler_name $flags -E ctest.c`;
if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
die 1;
}
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
}
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $fh "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
if ($compiler eq "PGI") {
$args = " -tp skylake -c -o $tmpf.o $tmpf";
}
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("$tmpf.o");
}
}
$no_rv64gv= 0;
if (($architecture eq "riscv64")) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with the RISCV vector extension";
$no_rv64gv = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
$code = '"vsetvli zero, zero, e8, m1\n"';
print $fh "int main(void){ __asm__ volatile($code); }\n";
$args = " -march=rv64gv -c -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_rv64gv = 1;
} else {
$no_rv64gv = 0;
}
unlink("$tmpf.o");
}
}
$c11_atomics = 0;
if ($data =~ /HAVE_C11/) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11";
$c11_atomics = 0;
} else {
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
print $fh "#include <stdatomic.h>\nint main(void){}\n";
$args = " -c -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$c11_atomics = 0;
} else {
$c11_atomics = 1;
}
unlink("$tmpf.o");
}
}
if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) {
$no_avx2 = 0;
$oldgcc = 0;
$data = `$compiler_name -dumpversion`;
if ($data <= 4.6) {
$no_avx2 = 1;
$oldgcc = 1;
}
}
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;
# Decide whether this is a cross build by comparing the compiler's target
# ($architecture, $os) with the build host ($hostarch, $hostos).
$cross = 0;

if ($architecture ne $hostarch) {
    $cross = 1;
    # A 32-bit target on the matching 64-bit host is not treated as cross.
    $cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86"));
    $cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
}

# A differing OS always means cross ...
$cross = 1 if ($os ne $hostos);
# ... except an Android target on a Linux host when TERMUX_APP_PID is set
# to a non-empty value. This is a string test (the old code compared
# numerically against ""), the same check the sh version of this script
# makes with [ -n "$TERMUX_APP_PID" ].
$cross = 0 if (($os eq "Android") && ($hostos eq "Linux")
               && defined($ENV{TERMUX_APP_PID}) && ($ENV{TERMUX_APP_PID} ne ""));
# Collect the extra libraries / library paths the C compiler passes to the
# linker, by running a verbose (-v) link of ctest2.c and scanning its output.
# Results are accumulated in $linker_L (search paths and -Wl, options),
# $linker_l (-l libraries) and $linker_a (static archives).

# Drop the OpenMP flag from the test link unless USE_OPENMP is 1.
$openmp = "" if $ENV{USE_OPENMP} != 1;
$linker_L = "";
$linker_l = "";
$linker_a = "";
{
# Compile, then link verbosely; stderr is merged into the captured output
# and the build products are removed afterwards.
$link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
# Collapse "-Y P," into "-Y" so the option and its value stay in one token
# when the output is split on commas below.
$link =~ s/\-Y\sP\,/\-Y/g;
# Tokenize on whitespace and commas.
@flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
# NOTE: the loop variable reuses the name $flags for each individual token.
foreach $flags (@flags) {
# Library search paths: -L..., excluding the -LIST: and -LANG: options.
if (
($flags =~ /^\-L/)
&& ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/)
) {
$linker_L .= $flags . " "
}
# -Y... tokens are forwarded to the linker via -Wl,
if ($flags =~ /^\-Y/) {
$linker_L .= "-Wl,". $flags . " "
}
# --exclude-libs is forwarded via -Wl, and then blanked so that none of the
# checks below can match it as well.
if ($flags =~ /^\--exclude-libs/) {
$linker_L .= "-Wl,". $flags . " ";
$flags="";
}
# -l libraries, minus a list of names that must not end up in CEXTRALIB
# (startup objects, gcc/OpenMP runtime, Windows system libraries, and any
# library whose name contains a digit).
if (
($flags =~ /^\-l/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
&& ($flags !~ /numa/)
&& ($flags !~ /crt[0-9]/)
&& ($flags !~ /gcc/)
&& ($flags !~ /user32/)
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /[0-9]+/)
) {
$linker_l .= $flags . " "
}
# Static archives named directly on the link line.
$linker_a .= $flags . " " if $flags =~ /\.a$/;
}
}
# Write the detection results to the two output files: $makefile receives
# make-style NAME=value assignments, $config receives C preprocessor #defines.
open(MAKEFILE, "> $makefile") || die "Can't create $makefile";
open(CONFFILE, "> $config"  ) || die "Can't create $config";

# print $data, "\n";

print MAKEFILE "OSNAME=$os\n";
print MAKEFILE "ARCH=$architecture\n";
print MAKEFILE "C_COMPILER=$compiler\n";
# Both BINARY32 and BINARY64 are always emitted: the one matching
# $binformat is set to 1, the other is left empty.  The format names are
# written as quoted strings rather than barewords.
print MAKEFILE "BINARY32=\n" if $binformat ne "bin32";
print MAKEFILE "BINARY64=\n" if $binformat ne "bin64";
print MAKEFILE "BINARY32=1\n" if $binformat eq "bin32";
print MAKEFILE "BINARY64=1\n" if $binformat eq "bin64";
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;

# Upper-case the names used to build the macro identifiers below.
# (tr takes character ranges directly; square brackets are not needed.)
$os           =~ tr/a-z/A-Z/;
$architecture =~ tr/a-z/A-Z/;
$compiler     =~ tr/a-z/A-Z/;

print CONFFILE "#define OS_$os\t1\n";
print CONFFILE "#define ARCH_$architecture\t1\n";
print CONFFILE "#define C_$compiler\t1\n";
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq "bin32";
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq "bin64";
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;

# Both branches currently emit the same definition; the Linux branch keeps
# the disabled libpthread symbol lookup for reference.
if ($os eq "LINUX") {

#    @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);

#    if ($pthread[2] ne "") {
#	print CONFFILE "#define PTHREAD_CREATE_FUNC	$pthread[2]\n";
#    } else {
    print CONFFILE "#define PTHREAD_CREATE_FUNC	pthread_create\n";
#    }
} else {
    print CONFFILE "#define PTHREAD_CREATE_FUNC	pthread_create\n";
}

close(MAKEFILE);
close(CONFFILE);

View File

@ -28,6 +28,8 @@ char* openblas_get_corename(void);
#ifdef OPENBLAS_OS_LINUX
/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
/* Queries thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
#endif
/* Get the parallelization type which is used by OpenBLAS */

View File

@ -161,6 +161,30 @@ if (${CORE} STREQUAL ARMV8SVE)
endif ()
endif ()
# Per-core -march selection for the newer Cortex targets.  The flag is only
# appended to CCOMMON_OPT for a single-target build (i.e. when DYNAMIC_ARCH
# is not set).  CORTEXA510, CORTEXA710 and CORTEXX2 get armv8-a with SVE;
# CORTEXX1 gets armv8.2-a without the SVE extension.
if (${CORE} STREQUAL CORTEXA510)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (${CORE} STREQUAL CORTEXA710)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (${CORE} STREQUAL CORTEXX1)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a")
endif ()
endif ()
if (${CORE} STREQUAL CORTEXX2)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif ()
if (${CORE} STREQUAL POWER10)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)

View File

@ -50,6 +50,15 @@ else()
set(ONLY_CBLAS_IN ${ONLY_CBLAS})
endif()
if (NOT DEFINED USE_PERL)
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/openblas.def
#TARGET ${OpenBLAS_LIBNAME} PRE_LINK
COMMAND "${PROJECT_SOURCE_DIR}/exports/gensymbol"
ARGS "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
COMMENT "Create openblas.def file"
VERBATIM)
else ()
add_custom_command(
OUTPUT ${PROJECT_BINARY_DIR}/openblas.def
#TARGET ${OpenBLAS_LIBNAME} PRE_LINK
@ -57,5 +66,5 @@ add_custom_command(
ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
COMMENT "Create openblas.def file"
VERBATIM)
endif()
endif()

View File

@ -25,11 +25,19 @@ check_language(Fortran)
if(CMAKE_Fortran_COMPILER)
enable_language(Fortran)
else()
if (NOT NO_LAPACK)
message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK")
endif()
set (NOFORTRAN 1)
set (NO_LAPACK 1)
if (NOT NO_LAPACK)
if (NOT XXXXX)
message(STATUS "No Fortran compiler found, can build only BLAS and f2c-converted LAPACK")
set(C_LAPACK 1)
if (INTERFACE64)
set (CCOMMON_OPT "${CCOMMON_OPT} -DLAPACK_ILP64")
endif ()
set(TIMER "NONE")
else ()
message(STATUS "No Fortran compiler found, can build only BLAS")
endif()
endif()
endif()
if (NOT ONLY_CBLAS)

View File

@ -67,7 +67,15 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
if (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
if (WIN32)
set(FCOMMON_OPT "${FCOMMON_OPT} /integer-size:64")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -integer-size 64")
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
endif ()
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -m32")
@ -214,6 +222,17 @@ if (${F_COMPILER} STREQUAL "COMPAQ")
endif ()
endif ()
# Flags for the Cray Fortran compiler.
if (${F_COMPILER} STREQUAL "CRAY")
# The C side is built with the F_INTERFACE_INTEL calling-convention macro.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL")
set(FCOMMON_OPT "${FCOMMON_OPT} -hnopattern")
# 64-bit integer interface requested: pass "-s integer64" to the Fortran
# compiler (presumably makes default INTEGER 64-bit - confirm against the
# Cray compiler documentation).
if (INTERFACE64)
set (FCOMMON_OPT "${FCOMMON_OPT} -s integer64")
endif ()
# Without USE_OPENMP, add "-O noomp" to the Fortran flags.
if (NOT USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -O noomp")
endif ()
endif ()
# from the root Makefile - this is for lapack-netlib to compile the correct secnd file.
if (${F_COMPILER} STREQUAL "GFORTRAN")
set(TIMER "INT_ETIME")

View File

@ -1,12 +1,14 @@
# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files.
if (NOT C_LAPACK)
message (STATUS "fortran lapack")
set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f la_xisnan.F90
../INSTALL/ilaver.f xerbla_array.f
../INSTALL/slamch.f)
set(SCLAUX
scombssq.f sbdsvdx.f sstevx.f sstein.f
la_constants.f90
sbdsdc.f
sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f
slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f
@ -15,16 +17,17 @@ set(SCLAUX
slapy2.f slapy3.f slarnv.f
slarra.f slarrb.f slarrc.f slarrd.f slarre.f slarrf.f slarrj.f
slarrk.f slarrr.f slaneg.f
slartg.f slaruv.f slas2.f slascl.f
slartg.f90 slaruv.f slas2.f slascl.f
slasd0.f slasd1.f slasd2.f slasd3.f slasd4.f slasd5.f slasd6.f
slasd7.f slasd8.f slasda.f slasdq.f slasdt.f
slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f
slasr.f slasrt.f slassq.f slasv2.f spttrf.f sstebz.f sstedc.f
slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
ssteqr.f ssterf.f slaisnan.f sisnan.f
slartgp.f slartgs.f
slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f
../INSTALL/second_${TIMER}.f)
set(DZLAUX
la_constants.f90
dbdsdc.f
dbdsvdx.f dstevx.f dstein.f
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
@ -34,13 +37,13 @@ set(DZLAUX
dlapy2.f dlapy3.f dlarnv.f
dlarra.f dlarrb.f dlarrc.f dlarrd.f dlarre.f dlarrf.f dlarrj.f
dlarrk.f dlarrr.f dlaneg.f
dlartg.f dlaruv.f dlas2.f dlascl.f
dlartg.f90 dlaruv.f dlas2.f dlascl.f
dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f
dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f
dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
dsteqr.f dsterf.f dlaisnan.f disnan.f
dlartgp.f dlartgs.f
dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f)
set(SLASRC
@ -58,6 +61,7 @@ set(SLASRC
sggrqf.f sggsvd3.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
slaqz0.f slaqz1.f slaqz2.f slaqz3.f slaqz4.f
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
slahr2.f slaic1.f slaln2.f slals0.f slalsa.f slalsd.f
slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
@ -170,10 +174,11 @@ set(CLASRC
clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
claqz0.f claqz1.f claqz2.f claqz3.f
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f90 clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f90
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
clatbs.f clatdf.f clatps.f clatrd.f clatrs.f clatrz.f
cpbcon.f cpbequ.f cpbrfs.f cpbstf.f cpbsv.f
@ -244,6 +249,7 @@ set(DLASRC
dggglm.f dgghrd.f dgghd3.f dgglse.f dggqrf.f
dggrqf.f dggsvd3.f dggsvp3.f dgtcon.f dgtrfs.f dgtsv.f
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
dlaqz0.f dlaqz1.f dlaqz2.f dlaqz3.f dlaqz4.f
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
dlahr2.f dlaic1.f dlaln2.f dlals0.f dlalsa.f dlalsd.f
@ -345,6 +351,7 @@ set(ZLASRC
zhetrs_3.f zhecon_3.f zhesv_rk.f
zhesv_aa.f zhesv_aa_2stage.f zhetrf_aa.f zhetrf_aa_2stage.f zhetrs_aa.f zhetrs_aa_2stage.f
zhgeqz.f zhpcon.f zhpev.f zhpevd.f
zlaqz0.f zlaqz1.f zlaqz2.f zlaqz3.f
zhpevx.f zhpgst.f zhpgv.f zhpgvd.f zhpgvx.f zhprfs.f zhpsv.f
zhpsvx.f
zhptrd.f zhptrf.f zhptri.f zhptrs.f zhsein.f zhseqr.f zlabrd.f
@ -362,9 +369,9 @@ set(ZLASRC
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
zlarfg.f zlarfgp.f zlarft.f
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f90 zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
zlassq.f zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f
zlassq.f90 zlasyf.f zlasyf_rook.f zlasyf_rk.f zlasyf_aa.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
@ -488,6 +495,499 @@ if(BUILD_COMPLEX16)
message(STATUS "Building Double Precision Complex")
endif()
else ()
message (STATUS "c lapack")
set(ALLAUX ilaenv.c ilaenv2stage.c ieeeck.c lsamen.c iparmq.c iparam2stage.c
ilaprec.c ilatrans.c ilauplo.c iladiag.c chla_transtype.c dlaset.c
../INSTALL/ilaver.c xerbla_array.c
../INSTALL/slamch.c)
set(SCLAUX
scombssq.c sbdsvdx.c sstevx.c sstein.c
sbdsdc.c
sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c
slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c
slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c
slagts.c slamrg.c slanst.c
slapy2.c slapy3.c slarnv.c
slarra.c slarrb.c slarrc.c slarrd.c slarre.c slarrf.c slarrj.c
slarrk.c slarrr.c slaneg.c
slartg.c slaruv.c slas2.c slascl.c
slasd0.c slasd1.c slasd2.c slasd3.c slasd4.c slasd5.c slasd6.c
slasd7.c slasd8.c slasda.c slasdq.c slasdt.c
slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c
slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c
ssteqr.c ssterf.c slaisnan.c sisnan.c
slartgp.c slartgs.c
../INSTALL/second_${TIMER}.c)
set(DZLAUX
dbdsdc.c
dbdsvdx.c dstevx.c dstein.c
dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c
dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c
dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c
dlagts.c dlamrg.c dlanst.c
dlapy2.c dlapy3.c dlarnv.c
dlarra.c dlarrb.c dlarrc.c dlarrd.c dlarre.c dlarrf.c dlarrj.c
dlarrk.c dlarrr.c dlaneg.c
dlartg.c dlaruv.c dlas2.c dlascl.c
dlasd0.c dlasd1.c dlasd2.c dlasd3.c dlasd4.c dlasd5.c dlasd6.c
dlasd7.c dlasd8.c dlasda.c dlasdq.c dlasdt.c
dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c
dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c
dsteqr.c dsterf.c dlaisnan.c disnan.c
dlartgp.c dlartgs.c
../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c)
set(SLASRC
sgbbrd.c sgbcon.c sgbequ.c sgbrfs.c sgbsv.c
sgbsvx.c sgbtf2.c sgbtrf.c sgbtrs.c sgebak.c sgebal.c sgebd2.c
sgebrd.c sgecon.c sgeequ.c sgees.c sgeesx.c sgeev.c sgeevx.c
sgehd2.c sgehrd.c sgelq2.c sgelqf.c
sgels.c sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
sgetrf2.c sgetri.c
sggbak.c sggbal.c
sgges.c sgges3.c sggesx.c sggev.c sggev3.c sggevx.c
sggglm.c sgghrd.c sgghd3.c sgglse.c sggqrf.c
sggrqf.c sggsvd3.c sggsvp3.c sgtcon.c sgtrfs.c sgtsv.c
sgtsvx.c sgttrf.c sgttrs.c sgtts2.c shgeqz.c
shsein.c shseqr.c slabrd.c slacon.c slacn2.c
slaein.c slaexc.c slag2.c slags2.c slagtm.c slagv2.c slahqr.c
slahr2.c slaic1.c slaln2.c slals0.c slalsa.c slalsd.c
slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c
slansy.c slantb.c slantp.c slantr.c slanv2.c
slapll.c slapmt.c
slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
slarf.c slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
slarrv.c slartv.c
slarz.c slarzb.c slarzt.c slasy2.c
slasyf.c slasyf_rook.c slasyf_rk.c slasyf_aa.c
slatbs.c slatdf.c slatps.c slatrd.c slatrs.c slatrz.c
sopgtr.c sopmtr.c sorg2l.c sorg2r.c
sorgbr.c sorghr.c sorgl2.c sorglq.c sorgql.c sorgqr.c sorgr2.c
sorgrq.c sorgtr.c sorm2l.c sorm2r.c sorm22.c
sormbr.c sormhr.c sorml2.c sormlq.c sormql.c sormqr.c sormr2.c
sormr3.c sormrq.c sormrz.c sormtr.c spbcon.c spbequ.c spbrfs.c
spbstf.c spbsv.c spbsvx.c
spbtf2.c spbtrf.c spbtrs.c spocon.c spoequ.c sporfs.c sposv.c
sposvx.c spotrf2.c spotri.c spstrf.c spstf2.c
sppcon.c sppequ.c
spprfs.c sppsv.c sppsvx.c spptrf.c spptri.c spptrs.c sptcon.c
spteqr.c sptrfs.c sptsv.c sptsvx.c spttrs.c sptts2.c srscl.c
ssbev.c ssbevd.c ssbevx.c ssbgst.c ssbgv.c ssbgvd.c ssbgvx.c
ssbtrd.c sspcon.c sspev.c sspevd.c sspevx.c sspgst.c
sspgv.c sspgvd.c sspgvx.c ssprfs.c sspsv.c sspsvx.c ssptrd.c
ssptrf.c ssptri.c ssptrs.c sstegr.c sstev.c sstevd.c sstevr.c
ssycon.c ssyev.c ssyevd.c ssyevr.c ssyevx.c ssygs2.c
ssygst.c ssygv.c ssygvd.c ssygvx.c ssyrfs.c ssysv.c ssysvx.c
ssytd2.c ssytf2.c ssytrd.c ssytrf.c ssytri.c ssytri2.c ssytri2x.c
ssyswapr.c ssytrs.c ssytrs2.c
ssyconv.c ssyconvf.c ssyconvf_rook.c
ssysv_aa.c ssysv_aa_2stage.c ssytrf_aa.c ssytrf_aa_2stage.c ssytrs_aa.c ssytrs_aa_2stage.c
ssytf2_rook.c ssytrf_rook.c ssytrs_rook.c
ssytri_rook.c ssycon_rook.c ssysv_rook.c
ssytf2_rk.c ssytrf_rk.c ssytrs_3.c
ssytri_3.c ssytri_3x.c ssycon_3.c ssysv_rk.c
ssysv_aa.c ssytrf_aa.c ssytrs_aa.c
stbcon.c
stbrfs.c stbtrs.c stgevc.c stgex2.c stgexc.c stgsen.c
stgsja.c stgsna.c stgsy2.c stgsyl.c stpcon.c stprfs.c stptri.c
stptrs.c
strcon.c strevc.c strevc3.c strexc.c strrfs.c strsen.c strsna.c strsyl.c
strtrs.c stzrzf.c sstemr.c
slansf.c spftrf.c spftri.c spftrs.c ssfrk.c stfsm.c stftri.c stfttp.c
stfttr.c stpttf.c stpttr.c strttf.c strttp.c
sgejsv.c sgesvj.c sgsvj0.c sgsvj1.c
sgeequb.c ssyequb.c spoequb.c sgbequb.c
sbbcsd.c slapmr.c sorbdb.c sorbdb1.c sorbdb2.c sorbdb3.c sorbdb4.c
sorbdb5.c sorbdb6.c sorcsd.c sorcsd2by1.c
sgeqrt.c sgeqrt2.c sgeqrt3.c sgemqrt.c
stpqrt.c stpqrt2.c stpmqrt.c stprfb.c
sgelqt.c sgelqt3.c sgemlqt.c
sgetsls.c sgetsqrhrt.c sgeqr.c slatsqr.c slamtsqr.c sgemqr.c
sgelq.c slaswlq.c slamswlq.c sgemlq.c
stplqt.c stplqt2.c stpmlqt.c
ssytrd_2stage.c ssytrd_sy2sb.c ssytrd_sb2st.c ssb2st_kernels.c
ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
sgesvdq.c slaorhr_col_getrfnp.c
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c )
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
sla_syrfsx_extended.c sla_syamv.c sla_syrcond.c sla_syrpvgrw.c
sposvxx.c sporfsx.c sla_porfsx_extended.c sla_porcond.c
sla_porpvgrw.c sgbsvxx.c sgbrfsx.c sla_gbrfsx_extended.c
sla_gbamv.c sla_gbrcond.c sla_gbrpvgrw.c sla_lin_berr.c slarscl2.c
slascl2.c sla_wwaddw.c)
set(CLASRC
cbdsqr.c cgbbrd.c cgbcon.c cgbequ.c cgbrfs.c cgbsv.c cgbsvx.c
cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
cgecon.c cgeequ.c cgees.c cgeesx.c cgeev.c cgeevx.c
cgehd2.c cgehrd.c cgelq2.c cgelqf.c
cgels.c cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
cgesvx.c cgetc2.c cgetrf2.c
cgetri.c
cggbak.c cggbal.c
cgges.c cgges3.c cggesx.c cggev.c cggev3.c cggevx.c
cggglm.c cgghrd.c cgghd3.c cgglse.c cggqrf.c cggrqf.c
cggsvd3.c cggsvp3.c
cgtcon.c cgtrfs.c cgtsv.c cgtsvx.c cgttrf.c cgttrs.c cgtts2.c chbev.c
chbevd.c chbevx.c chbgst.c chbgv.c chbgvd.c chbgvx.c chbtrd.c
checon.c cheev.c cheevd.c cheevr.c cheevx.c chegs2.c chegst.c
chegv.c chegvd.c chegvx.c cherfs.c chesv.c chesvx.c chetd2.c
chetf2.c chetrd.c
chetrf.c chetri.c chetri2.c chetri2x.c cheswapr.c
chetrs.c chetrs2.c
chetf2_rook.c chetrf_rook.c chetri_rook.c
chetrs_rook.c checon_rook.c chesv_rook.c
chetf2_rk.c chetrf_rk.c chetri_3.c chetri_3x.c
chetrs_3.c checon_3.c chesv_rk.c
chesv_aa.c chesv_aa_2stage.c chetrf_aa.c chetrf_aa_2stage.c chetrs_aa.c chetrs_aa_2stage.c
chgeqz.c chpcon.c chpev.c chpevd.c
chpevx.c chpgst.c chpgv.c chpgvd.c chpgvx.c chprfs.c chpsv.c
chpsvx.c
chptrd.c chptrf.c chptri.c chptrs.c chsein.c chseqr.c clabrd.c
clacgv.c clacon.c clacn2.c clacp2.c clacpy.c clacrm.c clacrt.c cladiv.c
claed0.c claed7.c claed8.c
claein.c claesy.c claev2.c clags2.c clagtm.c
clahef.c clahef_rook.c clahef_rk.c clahef_aa.c clahqr.c
clahr2.c claic1.c clals0.c clalsa.c clalsd.c clangb.c clange.c clangt.c
clanhb.c clanhe.c
clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c
clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c
claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
clarf.c clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
clarfx.c clarfy.c clargv.c clarnv.c clarrv.c clartg.c clartv.c
clarz.c clarzb.c clarzt.c clascl.c claset.c clasr.c classq.c
clasyf.c clasyf_rook.c clasyf_rk.c clasyf_aa.c
clatbs.c clatdf.c clatps.c clatrd.c clatrs.c clatrz.c
cpbcon.c cpbequ.c cpbrfs.c cpbstf.c cpbsv.c
cpbsvx.c cpbtf2.c cpbtrf.c cpbtrs.c cpocon.c cpoequ.c cporfs.c
cposv.c cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
cppcon.c cppequ.c cpprfs.c cppsv.c cppsvx.c cpptrf.c cpptri.c cpptrs.c
cptcon.c cpteqr.c cptrfs.c cptsv.c cptsvx.c cpttrf.c cpttrs.c cptts2.c
crot.c cspcon.c csprfs.c cspsv.c
cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
cstegr.c cstein.c csteqr.c csycon.c
csyrfs.c csysv.c csysvx.c csytf2.c csytrf.c csytri.c
csytri2.c csytri2x.c csyswapr.c
csytrs.c csytrs2.c
csyconv.c csyconvf.c csyconvf_rook.c
csytf2_rook.c csytrf_rook.c csytrs_rook.c
csytri_rook.c csycon_rook.c csysv_rook.c
csytf2_rk.c csytrf_rk.c csytrf_aa.c csytrf_aa_2stage.c csytrs_3.c csytrs_aa.c csytrs_aa_2stage.c
csytri_3.c csytri_3x.c csycon_3.c csysv_rk.c csysv_aa.c csysv_aa_2stage.c
ctbcon.c ctbrfs.c ctbtrs.c ctgevc.c ctgex2.c
ctgexc.c ctgsen.c ctgsja.c ctgsna.c ctgsy2.c ctgsyl.c ctpcon.c
ctprfs.c ctptri.c
ctptrs.c ctrcon.c ctrevc.c ctrevc3.c ctrexc.c ctrrfs.c ctrsen.c ctrsna.c
ctrsyl.c ctrtrs.c ctzrzf.c cung2l.c cung2r.c
cungbr.c cunghr.c cungl2.c cunglq.c cungql.c cungqr.c cungr2.c
cungrq.c cungtr.c cunm2l.c cunm2r.c cunmbr.c cunmhr.c cunml2.c cunm22.c
cunmlq.c cunmql.c cunmqr.c cunmr2.c cunmr3.c cunmrq.c cunmrz.c
cunmtr.c cupgtr.c cupmtr.c icmax1.c scsum1.c cstemr.c
chfrk.c ctfttp.c clanhf.c cpftrf.c cpftri.c cpftrs.c ctfsm.c ctftri.c
ctfttr.c ctpttf.c ctpttr.c ctrttf.c ctrttp.c
cgeequb.c cgbequb.c csyequb.c cpoequb.c cheequb.c
cbbcsd.c clapmr.c cunbdb.c cunbdb1.c cunbdb2.c cunbdb3.c cunbdb4.c
cunbdb5.c cunbdb6.c cuncsd.c cuncsd2by1.c
cgeqrt.c cgeqrt2.c cgeqrt3.c cgemqrt.c
ctpqrt.c ctpqrt2.c ctpmqrt.c ctprfb.c
cgelqt.c cgelqt3.c cgemlqt.c
cgetsls.c cgetsqrhrt.c cgeqr.c clatsqr.c clamtsqr.c cgemqr.c
cgelq.c claswlq.c clamswlq.c cgemlq.c
ctplqt.c ctplqt2.c ctpmlqt.c
chetrd_2stage.c chetrd_he2hb.c chetrd_hb2st.c chb2st_kernels.c
cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
cungtsqr.c cungtsqr_row.c cunhr_col.c )
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
csysvxx.c csyrfsx.c cla_syrfsx_extended.c cla_syamv.c
cla_syrcond_c.c cla_syrcond_x.c cla_syrpvgrw.c
cposvxx.c cporfsx.c cla_porfsx_extended.c
cla_porcond_c.c cla_porcond_x.c cla_porpvgrw.c
cgbsvxx.c cgbrfsx.c cla_gbrfsx_extended.c cla_gbamv.c
cla_gbrcond_c.c cla_gbrcond_x.c cla_gbrpvgrw.c
chesvxx.c cherfsx.c cla_herfsx_extended.c cla_heamv.c
cla_hercond_c.c cla_hercond_x.c cla_herpvgrw.c
cla_lin_berr.c clarscl2.c clascl2.c cla_wwaddw.c)
set(DLASRC
dgbbrd.c dgbcon.c dgbequ.c dgbrfs.c dgbsv.c
dgbsvx.c dgbtf2.c dgbtrf.c dgbtrs.c dgebak.c dgebal.c dgebd2.c
dgebrd.c dgecon.c dgeequ.c dgees.c dgeesx.c dgeev.c dgeevx.c
dgehd2.c dgehrd.c dgelq2.c dgelqf.c
dgels.c dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
dgetrf2.c dgetri.c
dggbak.c dggbal.c
dgges.c dgges3.c dggesx.c dggev.c dggev3.c dggevx.c
dggglm.c dgghrd.c dgghd3.c dgglse.c dggqrf.c
dggrqf.c dggsvd3.c dggsvp3.c dgtcon.c dgtrfs.c dgtsv.c
dgtsvx.c dgttrf.c dgttrs.c dgtts2.c dhgeqz.c
dhsein.c dhseqr.c dlabrd.c dlacon.c dlacn2.c
dlaein.c dlaexc.c dlag2.c dlags2.c dlagtm.c dlagv2.c dlahqr.c
dlahr2.c dlaic1.c dlaln2.c dlals0.c dlalsa.c dlalsd.c
dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c
dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
dlapll.c dlapmt.c
dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
dlarf.c dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
dlargv.c dlarrv.c dlartv.c
dlarz.c dlarzb.c dlarzt.c dlasy2.c
dlasyf.c dlasyf_rook.c dlasyf_rk.c dlasyf_aa.c
dlatbs.c dlatdf.c dlatps.c dlatrd.c dlatrs.c dlatrz.c
dopgtr.c dopmtr.c dorg2l.c dorg2r.c
dorgbr.c dorghr.c dorgl2.c dorglq.c dorgql.c dorgqr.c dorgr2.c
dorgrq.c dorgtr.c dorm2l.c dorm2r.c dorm22.c
dormbr.c dormhr.c dorml2.c dormlq.c dormql.c dormqr.c dormr2.c
dormr3.c dormrq.c dormrz.c dormtr.c dpbcon.c dpbequ.c dpbrfs.c
dpbstf.c dpbsv.c dpbsvx.c
dpbtf2.c dpbtrf.c dpbtrs.c dpocon.c dpoequ.c dporfs.c dposv.c
dposvx.c dpotrf2.c dpotri.c dpotrs.c dpstrf.c dpstf2.c
dppcon.c dppequ.c
dpprfs.c dppsv.c dppsvx.c dpptrf.c dpptri.c dpptrs.c dptcon.c
dpteqr.c dptrfs.c dptsv.c dptsvx.c dpttrs.c dptts2.c drscl.c
dsbev.c dsbevd.c dsbevx.c dsbgst.c dsbgv.c dsbgvd.c dsbgvx.c
dsbtrd.c dspcon.c dspev.c dspevd.c dspevx.c dspgst.c
dspgv.c dspgvd.c dspgvx.c dsprfs.c dspsv.c dspsvx.c dsptrd.c
dsptrf.c dsptri.c dsptrs.c dstegr.c dstev.c dstevd.c dstevr.c
dsycon.c dsyev.c dsyevd.c dsyevr.c
dsyevx.c dsygs2.c dsygst.c dsygv.c dsygvd.c dsygvx.c dsyrfs.c
dsysv.c dsysvx.c
dsytd2.c dsytf2.c dsytrd.c dsytrf.c dsytri.c dsytrs.c dsytrs2.c
dsytri2.c dsytri2x.c dsyswapr.c
dsyconv.c dsyconvf.c dsyconvf_rook.c
dsytf2_rook.c dsytrf_rook.c dsytrs_rook.c
dsytri_rook.c dsycon_rook.c dsysv_rook.c
dsytf2_rk.c dsytrf_rk.c dsytrs_3.c
dsytri_3.c dsytri_3x.c dsycon_3.c dsysv_rk.c
dsysv_aa.c dsysv_aa_2stage.c dsytrf_aa.c dsytrf_aa_2stage.c dsytrs_aa.c dsytrs_aa_2stage.c
dtbcon.c
dtbrfs.c dtbtrs.c dtgevc.c dtgex2.c dtgexc.c dtgsen.c
dtgsja.c dtgsna.c dtgsy2.c dtgsyl.c dtpcon.c dtprfs.c dtptri.c
dtptrs.c
dtrcon.c dtrevc.c dtrevc3.c dtrexc.c dtrrfs.c dtrsen.c dtrsna.c dtrsyl.c
dtrtrs.c dtzrzf.c dstemr.c
dsgesv.c dsposv.c dlag2s.c slag2d.c dlat2s.c
dlansf.c dpftrf.c dpftri.c dpftrs.c dsfrk.c dtfsm.c dtftri.c dtfttp.c
dtfttr.c dtpttf.c dtpttr.c dtrttf.c dtrttp.c
dgejsv.c dgesvj.c dgsvj0.c dgsvj1.c
dgeequb.c dsyequb.c dpoequb.c dgbequb.c
dbbcsd.c dlapmr.c dorbdb.c dorbdb1.c dorbdb2.c dorbdb3.c dorbdb4.c
dorbdb5.c dorbdb6.c dorcsd.c dorcsd2by1.c
dgeqrt.c dgeqrt2.c dgeqrt3.c dgemqrt.c
dtpqrt.c dtpqrt2.c dtpmqrt.c dtprfb.c
dgelqt.c dgelqt3.c dgemlqt.c
dgetsls.c dgetsqrhrt.c dgeqr.c dlatsqr.c dlamtsqr.c dgemqr.c
dgelq.c dlaswlq.c dlamswlq.c dgemlq.c
dtplqt.c dtplqt2.c dtpmlqt.c
dsytrd_2stage.c dsytrd_sy2sb.c dsytrd_sb2st.c dsb2st_kernels.c
dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c )
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
dla_syrfsx_extended.c dla_syamv.c dla_syrcond.c dla_syrpvgrw.c
dposvxx.c dporfsx.c dla_porfsx_extended.c dla_porcond.c
dla_porpvgrw.c dgbsvxx.c dgbrfsx.c dla_gbrfsx_extended.c
dla_gbamv.c dla_gbrcond.c dla_gbrpvgrw.c dla_lin_berr.c dlarscl2.c
dlascl2.c dla_wwaddw.c)
set(ZLASRC
zbdsqr.c zgbbrd.c zgbcon.c zgbequ.c zgbrfs.c zgbsv.c zgbsvx.c
zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
zgecon.c zgeequ.c zgees.c zgeesx.c zgeev.c zgeevx.c
zgehd2.c zgehrd.c zgelq2.c zgelqf.c
zgels.c zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
zgetc2.c zgetrf2.c
zgetri.c
zggbak.c zggbal.c
zgges.c zgges3.c zggesx.c zggev.c zggev3.c zggevx.c
zggglm.c zgghrd.c zgghd3.c zgglse.c zggqrf.c zggrqf.c
zggsvd3.c zggsvp3.c
zgtcon.c zgtrfs.c zgtsv.c zgtsvx.c zgttrf.c zgttrs.c zgtts2.c zhbev.c
zhbevd.c zhbevx.c zhbgst.c zhbgv.c zhbgvd.c zhbgvx.c zhbtrd.c
zhecon.c zheev.c zheevd.c zheevr.c zheevx.c zhegs2.c zhegst.c
zhegv.c zhegvd.c zhegvx.c zherfs.c zhesv.c zhesvx.c zhetd2.c
zhetf2.c zhetrd.c
zhetrf.c zhetri.c zhetri2.c zhetri2x.c zheswapr.c
zhetrs.c zhetrs2.c
zhetf2_rook.c zhetrf_rook.c zhetri_rook.c
zhetrs_rook.c zhecon_rook.c zhesv_rook.c
zhetf2_rk.c zhetrf_rk.c zhetri_3.c zhetri_3x.c
zhetrs_3.c zhecon_3.c zhesv_rk.c
zhesv_aa.c zhesv_aa_2stage.c zhetrf_aa.c zhetrf_aa_2stage.c zhetrs_aa.c zhetrs_aa_2stage.c
zhgeqz.c zhpcon.c zhpev.c zhpevd.c
zhpevx.c zhpgst.c zhpgv.c zhpgvd.c zhpgvx.c zhprfs.c zhpsv.c
zhpsvx.c
zhptrd.c zhptrf.c zhptri.c zhptrs.c zhsein.c zhseqr.c zlabrd.c
zlacgv.c zlacon.c zlacn2.c zlacp2.c zlacpy.c zlacrm.c zlacrt.c zladiv.c
zlaed0.c zlaed7.c zlaed8.c
zlaein.c zlaesy.c zlaev2.c zlags2.c zlagtm.c
zlahef.c zlahef_rook.c zlahef_rk.c zlahef_aa.c zlahqr.c
zlahr2.c zlaic1.c zlals0.c zlalsa.c zlalsd.c zlangb.c zlange.c
zlangt.c zlanhb.c
zlanhe.c
zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c
zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c
zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
zlarcm.c zlarf.c zlarfb.c zlarfb_gett.c
zlarfg.c zlarfgp.c zlarft.c
zlarfx.c zlarfy.c zlargv.c zlarnv.c zlarrv.c zlartg.c zlartv.c
zlarz.c zlarzb.c zlarzt.c zlascl.c zlaset.c zlasr.c
zlassq.c zlasyf.c zlasyf_rook.c zlasyf_rk.c zlasyf_aa.c
zlatbs.c zlatdf.c zlatps.c zlatrd.c zlatrs.c zlatrz.c
zpbcon.c zpbequ.c zpbrfs.c zpbstf.c zpbsv.c
zpbsvx.c zpbtf2.c zpbtrf.c zpbtrs.c zpocon.c zpoequ.c zporfs.c
zposv.c zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
zppcon.c zppequ.c zpprfs.c zppsv.c zppsvx.c zpptrf.c zpptri.c zpptrs.c
zptcon.c zpteqr.c zptrfs.c zptsv.c zptsvx.c zpttrf.c zpttrs.c zptts2.c
zrot.c zspcon.c zsprfs.c zspsv.c
zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
zstegr.c zstein.c zsteqr.c zsycon.c
zsyrfs.c zsysv.c zsysvx.c zsytf2.c zsytrf.c zsytri.c
zsytri2.c zsytri2x.c zsyswapr.c
zsytrs.c zsytrs2.c
zsyconv.c zsyconvf.c zsyconvf_rook.c
zsytf2_rook.c zsytrf_rook.c zsytrs_rook.c zsytrs_aa.c zsytrs_aa_2stage.c
zsytri_rook.c zsycon_rook.c zsysv_rook.c
zsytf2_rk.c zsytrf_rk.c zsytrf_aa.c zsytrf_aa_2stage.c zsytrs_3.c
zsytri_3.c zsytri_3x.c zsycon_3.c zsysv_rk.c zsysv_aa.c zsysv_aa_2stage.c
ztbcon.c ztbrfs.c ztbtrs.c ztgevc.c ztgex2.c
ztgexc.c ztgsen.c ztgsja.c ztgsna.c ztgsy2.c ztgsyl.c ztpcon.c
ztprfs.c ztptri.c
ztptrs.c ztrcon.c ztrevc.c ztrevc3.c ztrexc.c ztrrfs.c ztrsen.c ztrsna.c
ztrsyl.c ztrtrs.c ztzrzf.c zung2l.c
zung2r.c zungbr.c zunghr.c zungl2.c zunglq.c zungql.c zungqr.c zungr2.c
zungrq.c zungtr.c zunm2l.c zunm2r.c zunmbr.c zunmhr.c zunml2.c zunm22.c
zunmlq.c zunmql.c zunmqr.c zunmr2.c zunmr3.c zunmrq.c zunmrz.c
zunmtr.c zupgtr.c
zupmtr.c izmax1.c dzsum1.c zstemr.c
zcgesv.c zcposv.c zlag2c.c clag2z.c zlat2c.c
zhfrk.c ztfttp.c zlanhf.c zpftrf.c zpftri.c zpftrs.c ztfsm.c ztftri.c
ztfttr.c ztpttf.c ztpttr.c ztrttf.c ztrttp.c
zgeequb.c zgbequb.c zsyequb.c zpoequb.c zheequb.c
zbbcsd.c zlapmr.c zunbdb.c zunbdb1.c zunbdb2.c zunbdb3.c zunbdb4.c
zunbdb5.c zunbdb6.c zuncsd.c zuncsd2by1.c
zgeqrt.c zgeqrt2.c zgeqrt3.c zgemqrt.c
ztpqrt.c ztpqrt2.c ztpmqrt.c ztprfb.c
ztplqt.c ztplqt2.c ztpmlqt.c
zgelqt.c zgelqt3.c zgemlqt.c
zgetsls.c zgetsqrhrt.c zgeqr.c zlatsqr.c zlamtsqr.c zgemqr.c
zgelq.c zlaswlq.c zlamswlq.c zgemlq.c
zhetrd_2stage.c zhetrd_he2hb.c zhetrd_hb2st.c zhb2st_kernels.c
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
zungtsqr.c zungtsqr_row.c zunhr_col.c)
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
zla_syrfsx_extended.c zla_syamv.c zla_syrcond_c.c zla_syrcond_x.c
zla_syrpvgrw.c zposvxx.c zporfsx.c zla_porfsx_extended.c
zla_porcond_c.c zla_porcond_x.c zla_porpvgrw.c zgbsvxx.c zgbrfsx.c
zla_gbrfsx_extended.c zla_gbamv.c zla_gbrcond_c.c zla_gbrcond_x.c
zla_gbrpvgrw.c zhesvxx.c zherfsx.c zla_herfsx_extended.c
zla_heamv.c zla_hercond_c.c zla_hercond_x.c zla_herpvgrw.c
zla_lin_berr.c zlarscl2.c zlascl2.c zla_wwaddw.c)
if(USE_XBLAS)
set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
endif()
list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c
DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c
DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c)
list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c
DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c
DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c)
list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
message(STATUS "Building deprecated routines")
set(DSLASRC spotrs.c)
set(ZCLASRC cpotrs.c)
set(SCATGEN slatm1.c slaran.c slarnd.c)
set(SMATGEN slatms.c slatme.c slatmr.c slatmt.c
slagge.c slagsy.c slakf2.c slarge.c slaror.c slarot.c slatm2.c
slatm3.c slatm5.c slatm6.c slatm7.c slahilb.c)
set(CMATGEN clatms.c clatme.c clatmr.c clatmt.c
clagge.c claghe.c clagsy.c clakf2.c clarge.c claror.c clarot.c
clatm1.c clarnd.c clatm2.c clatm3.c clatm5.c clatm6.c clahilb.c slatm7.c)
set(DZATGEN dlatm1.c dlaran.c dlarnd.c)
set(DMATGEN dlatms.c dlatme.c dlatmr.c dlatmt.c
dlagge.c dlagsy.c dlakf2.c dlarge.c dlaror.c dlarot.c dlatm2.c
dlatm3.c dlatm5.c dlatm6.c dlatm7.c dlahilb.c)
set(ZMATGEN zlatms.c zlatme.c zlatmr.c zlatmt.c
zlagge.c zlaghe.c zlagsy.c zlakf2.c zlarge.c zlaror.c zlarot.c
zlatm1.c zlarnd.c zlatm2.c zlatm3.c zlatm5.c zlatm6.c zlahilb.c dlatm7.c)
# Assemble the list of LAPACK sources (LA_REL_SRC) and matrix-generator
# sources (LA_GEN_SRC) from the per-precision lists, depending on which
# precisions are being built.  BUILD_SINGLE assigns the lists from scratch;
# the other precisions append to whatever is already there.  ALLAUX and the
# shared auxiliary lists may therefore be added more than once.
if(BUILD_SINGLE)
set(LA_REL_SRC ${SLASRC} ${DSLASRC} ${ALLAUX} ${SCLAUX})
set(LA_GEN_SRC ${SMATGEN} ${SCATGEN})
message(STATUS "Building Single Precision")
endif()
if(BUILD_DOUBLE)
set(LA_REL_SRC ${LA_REL_SRC} ${DLASRC} ${DSLASRC} ${ALLAUX} ${DZLAUX})
set(LA_GEN_SRC ${LA_GEN_SRC} ${DMATGEN} ${DZATGEN})
message(STATUS "Building Double Precision")
endif()
if(BUILD_COMPLEX)
set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX})
SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN})
message(STATUS "Building Single Precision Complex")
endif()
if(BUILD_COMPLEX16)
set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX})
SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN})
# for zlange/zlanhe
# dcombssq.c is otherwise only pulled in through DLASRC, so add it
# explicitly when double precision real is not being built.
if (NOT BUILD_DOUBLE)
set (LA_REL_SRC ${LA_REL_SRC} dcombssq.c)
endif ()
message(STATUS "Building Double Precision Complex")
endif()
endif()
# add lapack-netlib folder to the sources
set(LA_SOURCES "")
foreach (LA_FILE ${LA_REL_SRC})
@ -496,4 +996,9 @@ endforeach ()
foreach (LA_FILE ${LA_GEN_SRC})
list(APPEND LA_SOURCES "${NETLIB_LAPACK_DIR}/TESTING/MATGEN/${LA_FILE}")
endforeach ()
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
if (NOT C_LAPACK)
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
else ()
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
endif ()

View File

@ -131,6 +131,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
@ -143,6 +145,684 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "ATOM")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t24576\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define SLOCAL_BUFFER_SIZE\t16384\n"
"#define DLOCAL_BUFFER_SIZE\t8192\n"
"#define CLOCAL_BUFFER_SIZE\t16384\n"
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 1)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "PRESCOTT")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t16384\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t1048576\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define SLOCAL_BUFFER_SIZE\t8192\n"
"#define DLOCAL_BUFFER_SIZE\t8192\n"
"#define CLOCAL_BUFFER_SIZE\t8192\n"
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "NEHALEM")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define SLOCAL_BUFFER_SIZE\t65535\n"
"#define DLOCAL_BUFFER_SIZE\t32768\n"
"#define CLOCAL_BUFFER_SIZE\t65536\n"
"#define ZLOCAL_BUFFER_SIZE\t32768\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 1)
set(ZGEMM_UNROLL_N 4)
set(CGEMM3M_UNROLL_M 4)
set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 2)
set(ZGEMM3M_UNROLL_N 8)
elseif ("${TCORE}" STREQUAL "SANDYBRIDGE")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_AVX\n"
"#define SLOCAL_BUFFER_SIZE\t24576\n"
"#define DLOCAL_BUFFER_SIZE\t16384\n"
"#define CLOCAL_BUFFER_SIZE\t32768\n"
"#define ZLOCAL_BUFFER_SIZE\t24576\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_AVX 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 1)
set(ZGEMM_UNROLL_N 4)
set(CGEMM3M_UNROLL_M 4)
set(CGEMM3M_UNROLL_N 8)
set(ZGEMM3M_UNROLL_M 2)
set(ZGEMM3M_UNROLL_N 8)
elseif ("${TCORE}" STREQUAL "HASWELL")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_AVX\n"
"#define HAVE_AVX2\n"
"#define HAVE_FMA3\n"
"#define SLOCAL_BUFFER_SIZE\t20480\n"
"#define DLOCAL_BUFFER_SIZE\t32768\n"
"#define CLOCAL_BUFFER_SIZE\t16384\n"
"#define ZLOCAL_BUFFER_SIZE\t12288\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_AVX 1)
set(HAVE_AVX2 1)
set(HAVE_FMA3 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "SKYLAKEX")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_AVX\n"
"#define HAVE_AVX2\n"
"#define HAVE_FMA3\n"
"#define HAVE_AVX512VL\n"
"#define SLOCAL_BUFFER_SIZE\t28672\n"
"#define DLOCAL_BUFFER_SIZE\t12288\n"
"#define CLOCAL_BUFFER_SIZE\t12288\n"
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
set(HAVE_CMOV 1)
set(HAVE_MMX 1)
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_AVX 1)
set(HAVE_AVX2 1)
set(HAVE_FMA3 1)
set(HAVE_AVX512VL 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "COOPERLAKE")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_AVX\n"
"#define HAVE_AVX2\n"
"#define HAVE_FMA3\n"
"#define HAVE_AVX512VL\n"
"#define HAVE_AVX512BF16\n"
"#define SLOCAL_BUFFER_SIZE\t20480\n"
"#define DLOCAL_BUFFER_SIZE\t12288\n"
"#define CLOCAL_BUFFER_SIZE\t12288\n"
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
set(HAVE_CMOV 1)
set(HAVE_MMX 1)
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_AVX 1)
set(HAVE_AVX2 1)
set(HAVE_FMA3 1)
set(HAVE_AVX512VL 1)
set(HAVE_AVX512BF16 1)
set(SBGEMM_UNROLL_M 16)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "SAPPHIRERAPIDS")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t262144\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_AVX\n"
"#define HAVE_AVX2\n"
"#define HAVE_FMA3\n"
"#define HAVE_AVX512VL\n"
"#define HAVE_AVX512BF16\n"
"#define SLOCAL_BUFFER_SIZE\t20480\n"
"#define DLOCAL_BUFFER_SIZE\t12288\n"
"#define CLOCAL_BUFFER_SIZE\t12288\n"
"#define ZLOCAL_BUFFER_SIZE\t8192\n")
set(HAVE_CMOV 1)
set(HAVE_MMX 1)
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_AVX 1)
set(HAVE_AVX2 1)
set(HAVE_FMA3 1)
set(HAVE_AVX512VL 1)
set(HAVE_AVX512BF16 1)
set(SBGEMM_UNROLL_M 32)
set(SBGEMM_UNROLL_N 16)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "OPTERON")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t1048576\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t32\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_3DNOW\n"
"#define HAVE_3DNOWEX\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define SLOCAL_BUFFER_SIZE\t15360\n"
"#define DLOCAL_BUFFER_SIZE\t15360\n"
"#define CLOCAL_BUFFER_SIZE\t15360\n"
"#define ZLOCAL_BUFFER_SIZE\t15360\n")
set(HAVE_3DNOW 1)
set(HAVE_3DNOWEX 1)
set(HAVE_MMX 1)
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "BARCELONA")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSE4A\n"
"#define HAVE_MISALIGNSSE\n"
"#define HAVE_128BITFPU\n"
"#define HAVE_FASTMOVU\n"
"#define SLOCAL_BUFFER_SIZE\t14336\n"
"#define DLOCAL_BUFFER_SIZE\t14336\n"
"#define CLOCAL_BUFFER_SIZE\t14336\n"
"#define ZLOCAL_BUFFER_SIZE\t14336\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSE4A 1)
set(HAVE_MISALIGNSSE 1)
set(HAVE_128BITFPU 1)
set(HAVE_FASTMOVU 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "BULLDOZER")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t49152\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t1024000\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t32\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSE4A\n"
"#define HAVE_AVX\n"
"#define HAVE_MISALIGNSSE\n"
"#define HAVE_128BITFPU\n"
"#define HAVE_FASTMOVU\n"
"#define SLOCAL_BUFFER_SIZE\t5376\n"
"#define DLOCAL_BUFFER_SIZE\t5376\n"
"#define CLOCAL_BUFFER_SIZE\t14336\n"
"#define ZLOCAL_BUFFER_SIZE\t14336\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSE4A 1)
set(HAVE_AVX 1)
set(HAVE_MISALIGNSSE 1)
set(HAVE_128BITFPU 1)
set(HAVE_FASTMOVU 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "PILEDRIVER")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t16384\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t2097152\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_SSE4A\n"
"#define HAVE_AVX\n"
"#define HAVE_MISALIGNSSE\n"
"#define HAVE_128BITFPU\n"
"#define HAVE_FASTMOVU\n"
"#define HAVE_CFLUSH\n"
"#define HAVE_FMA3\n"
"#define SLOCAL_BUFFER_SIZE\t6144\n"
"#define DLOCAL_BUFFER_SIZE\t5376\n"
"#define CLOCAL_BUFFER_SIZE\t10752\n"
"#define ZLOCAL_BUFFER_SIZE\t10752\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_SSE4A 1)
set(HAVE_AVX 1)
set(HAVE_FMA3 1)
set(HAVE_MISALIGNSSE 1)
set(HAVE_128BITFPU 1)
set(HAVE_FASTMOVU 1)
set(HAVE_CFLUSH 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "STEAMROLLER")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t16384\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t2097152\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_SSE4A\n"
"#define HAVE_AVX\n"
"#define HAVE_MISALIGNSSE\n"
"#define HAVE_128BITFPU\n"
"#define HAVE_FASTMOVU\n"
"#define HAVE_CFLUSH\n"
"#define HAVE_FMA3\n"
"#define SLOCAL_BUFFER_SIZE\t6144\n"
"#define DLOCAL_BUFFER_SIZE\t5120\n"
"#define CLOCAL_BUFFER_SIZE\t10240\n"
"#define ZLOCAL_BUFFER_SIZE\t10240\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_SSE4A 1)
set(HAVE_AVX 1)
set(HAVE_FMA3 1)
set(HAVE_MISALIGNSSE 1)
set(HAVE_128BITFPU 1)
set(HAVE_FASTMOVU 1)
set(HAVE_CFLUSH 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "EXCAVATOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t16384\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t2097152\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_SSE4A\n"
"#define HAVE_AVX\n"
"#define HAVE_MISALIGNSSE\n"
"#define HAVE_128BITFPU\n"
"#define HAVE_FASTMOVU\n"
"#define HAVE_CFLUSH\n"
"#define HAVE_FMA3\n"
"#define SLOCAL_BUFFER_SIZE\t6144\n"
"#define DLOCAL_BUFFER_SIZE\t5120\n"
"#define CLOCAL_BUFFER_SIZE\t10240\n"
"#define ZLOCAL_BUFFER_SIZE\t10240\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_SSE4A 1)
set(HAVE_AVX 1)
set(HAVE_FMA3 1)
set(HAVE_MISALIGNSSE 1)
set(HAVE_128BITFPU 1)
set(HAVE_FASTMOVU 1)
set(HAVE_CFLUSH 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "ZEN")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSE4_1\n"
"#define HAVE_SSE4_2\n"
"#define HAVE_SSE4A\n"
"#define HAVE_MISALIGNSSE\n"
"#define HAVE_128BITFPU\n"
"#define HAVE_FASTMOVU\n"
"#define HAVE_CFLUSH\n"
"#define HAVE_AVX\n"
"#define HAVE_AVX2\n"
"#define HAVE_FMA3\n"
"#define SLOCAL_BUFFER_SIZE\t20480\n"
"#define DLOCAL_BUFFER_SIZE\t32768\n"
"#define CLOCAL_BUFFER_SIZE\t16384\n"
"#define ZLOCAL_BUFFER_SIZE\t12288\n")
set(HAVE_SSE 1)
set(HAVE_SSE2 1)
set(HAVE_SSE3 1)
set(HAVE_SSE4_1 1)
set(HAVE_SSE4_2 1)
set(HAVE_AVX 1)
set(HAVE_AVX2 1)
set(HAVE_FMA3 1)
set(HAVE_SSE4A 1)
set(HAVE_MISALIGNSSE 1)
set(HAVE_128BITFPU 1)
set(HAVE_FASTMOVU 1)
set(HAVE_CFLUSH 1)
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 8)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
@ -199,12 +879,12 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
if ("${TCORE}" STREQUAL "CORTEXA57")
if ("${TCORE}" STREQUAL "CORTEXA57")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
else ()
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 8)
set(SGEMM_UNROLL_N 8)
endif ()
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
@ -581,6 +1261,15 @@ endif ()
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "GENERIC")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
endif()
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
@ -603,7 +1292,7 @@ endif ()
"#define GEMM_MULTITHREAD_THRESHOLD\t${GEMM_MULTITHREAD_THRESHOLD}\n")
# Move to where gen_config_h would place it
file(MAKE_DIRECTORY ${TARGET_CONF_DIR})
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
file(RENAME ${TARGET_CONF_TEMP} "${TARGET_CONF_DIR}/${TARGET_CONF}")
else(NOT CMAKE_CROSSCOMPILING)
# compile getarch
@ -639,7 +1328,7 @@ else(NOT CMAKE_CROSSCOMPILING)
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()

View File

@ -284,8 +284,15 @@ if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
else ()
set(NO_LAPACK 1)
set(NO_LAPACKE 1)
if (NOT XXXX)
set(C_LAPACK 1)
if (INTERFACE64)
set (CCOMMON_OPT "${CCOMMON_OPT} -DLAPACK_ILP64")
endif ()
set(TIMER "NONE")
else ()
set (NO_LAPACK 1)
endif ()
endif ()
if (BINARY64)
@ -552,6 +559,14 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
endforeach ()
endif ()
if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY")
# Strip GCC-style x86 SIMD options from the flags used for the LAPACK Fortran
# sources when building with the NAG or Cray Fortran compiler (presumably
# these compilers reject them - confirm against the compiler manuals).
#
# The entries form a CMake list, so they must be separated by ';'. A ','
# would fuse two options into one entry that never matches.
# Longer options are listed before their prefixes ("-mavx2" before "-mavx");
# otherwise removing "-mavx" first would leave a stray "2" behind.
set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx2;-mavx;-mskylake-avx512")
foreach (FILTER_FLAG ${FILTER_FLAGS})
# Quote the inputs so that an empty or list-valued flag variable is still
# passed to string(REPLACE) as exactly one input argument.
string(REPLACE "${FILTER_FLAG}" "" LAPACK_FFLAGS "${LAPACK_FFLAGS}")
string(REPLACE "${FILTER_FLAG}" "" LAPACK_FPFLAGS "${LAPACK_FPFLAGS}")
endforeach ()
endif ()
if ("${F_COMPILER}" STREQUAL "GFORTRAN")
# lapack-netlib is rife with uninitialized warnings -hpa
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} -Wno-maybe-uninitialized")

View File

@ -31,7 +31,11 @@ endif()
# Pretty thorough determination of arch. Add more if needed
if(CMAKE_CL_64 OR MINGW64)
set(X86_64 1)
# Distinguish the two 64-bit targets by processor name: anything starting
# with aarch64/arm64 (either case) selects ARM64, everything else X86_64.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
set(ARM64 1)
else()
set(X86_64 1)
endif()
elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")

View File

@ -33,9 +33,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef COMMON_ARM64
#define COMMON_ARM64
/* Memory barrier macros: MB issues "dmb ish", WMB "dmb ishst" and RMB
 * "dmb ishld". MSVC builds use the __dmb intrinsic from <intrin.h> with the
 * matching _ARM64_BARRIER_* constant; all other compilers use inline
 * assembly with a "memory" clobber. */
#ifdef C_MSVC
#include <intrin.h>
#define MB __dmb(_ARM64_BARRIER_ISH)
#define WMB __dmb(_ARM64_BARRIER_ISHST)
#define RMB __dmb(_ARM64_BARRIER_ISHLD)
#else
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory")
#endif
#define INLINE inline
@ -53,6 +60,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
BLASULONG ret;
do {
#ifndef C_MSVC
__asm__ __volatile__(
"mov x4, #1 \n\t"
"sevl \n\t"
@ -70,7 +78,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
);
#else
while (*address) {YIELDING;}
ret=InterlockedExchange64((volatile LONG64 *)(address), 1);
#endif
} while (ret);
@ -80,6 +91,14 @@ static void __inline blas_lock(volatile BLASULONG *address){
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
static __inline BLASULONG rpcc(void){
#ifdef C_MSVC
/* MSVC build: assemble the system-register identifier for PMCCNTR_EL0 from
 * its op0/op1/CRn/CRm/op2 fields (see the per-line comments) and read the
 * register through the _ReadStatusReg intrinsic. */
const int64_t pmccntr_el0 = (((3 & 1) << 14) | // op0
((3 & 7) << 11) | // op1
((9 & 15) << 7) | // crn
((13 & 15) << 3) | // crm
((0 & 7) << 0)); // op2
return _ReadStatusReg(pmccntr_el0);
#else
BLASULONG ret = 0;
blasint shift;
@ -87,6 +106,7 @@ static __inline BLASULONG rpcc(void){
__asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift));
return ret << shift;
#endif
}
#define RPCC_DEFINED

View File

@ -2610,8 +2610,9 @@
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K)
#if !defined(DYNAMIC_ARCH) \
&& (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K))
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;

View File

@ -92,7 +92,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define SEEK_ADDRESS
#if defined(C910V)
#include <riscv-vector.h>
#include <riscv_vector.h>
#endif
#endif

View File

@ -6,12 +6,14 @@
#include "../cblas.h"
#include "cpp_thread_safety_common.h"
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize)
{
const blasint inc = 1;
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
}
}
int main(int argc, char* argv[]){
int main(int argc, char* argv[])
{
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
uint32_t numTestRounds = 16; //number of testing rounds before success exit
@ -20,20 +22,23 @@ int main(int argc, char* argv[]){
if (maxHwThreads < 52)
numConcurrentThreads = maxHwThreads;
if (argc > 4){
if (argc > 4)
{
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
abort();
}
if(argc == 4){
}
if(argc == 4)
{
std::vector<std::string> cliArgs;
for (int i = 1; i < argc; i++){
for (int i = 1; i < argc; i++)
{
cliArgs.push_back(argv[i]);
std::cout<<argv[i]<<std::endl;
}
}
randomMatSize = std::stoul(cliArgs.at(0));
numConcurrentThreads = std::stoul(cliArgs.at(1));
numTestRounds = std::stoul(cliArgs.at(2));
}
}
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
@ -56,15 +61,18 @@ int main(int argc, char* argv[]){
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
std::cout<<"Allocating matrices..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
for(uint32_t i=0; i<numConcurrentThreads; i++)
{
matBlock.at(i).resize(randomMatSize*randomMatSize);
}
}
std::cout<<"done\n";
std::cout<<"Allocating vectors..."<<std::flush;
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
for(uint32_t i=0; i<(numConcurrentThreads*2); i++)
{
vecBlock.at(i).resize(randomMatSize);
}
}
std::cout<<"done\n";
//pauser();
std::cout<<"Filling matrices with random numbers..."<<std::flush;
@ -77,31 +85,35 @@ int main(int argc, char* argv[]){
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
omp_set_num_threads(numConcurrentThreads);
for(uint32_t R=0; R<numTestRounds; R++){
for(uint32_t R=0; R<numTestRounds; R++)
{
std::cout<<"DGEMV round #"<<R<<std::endl;
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
for(uint32_t i=0; i<numConcurrentThreads; i++){
for(uint32_t i=0; i<numConcurrentThreads; i++)
{
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
}
}
std::cout<<"done\n";
std::cout<<"Waiting for threads to finish..."<<std::flush;
for(uint32_t i=0; i<numConcurrentThreads; i++){
for(uint32_t i=0; i<numConcurrentThreads; i++)
{
futureBlock[i].get();
}
}
std::cout<<"done\n";
std::cout<<"Comparing results from different threads..."<<std::flush;
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++)
{
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
return -1;
}
}
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"OK!\n"<<std::endl;
}
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
return 0;
}
}

View File

@ -45,6 +45,10 @@ size_t length64=sizeof(value64);
#define CPU_NEOVERSEN1 11
#define CPU_NEOVERSEV1 16
#define CPU_NEOVERSEN2 17
#define CPU_CORTEXX1 18
#define CPU_CORTEXX2 19
#define CPU_CORTEXA510 20
#define CPU_CORTEXA710 21
// Qualcomm
#define CPU_FALKOR 6
// Cavium
@ -59,6 +63,8 @@ size_t length64=sizeof(value64);
#define CPU_VORTEX 13
// Fujitsu
#define CPU_A64FX 15
// Phytium
#define CPU_FT2000 22
static char *cpuname[] = {
"UNKNOWN",
@ -73,12 +79,17 @@ static char *cpuname[] = {
"TSV110",
"EMAG8180",
"NEOVERSEN1",
"NEOVERSEV1"
"NEOVERSEN2"
"THUNDERX3T110",
"VORTEX",
"CORTEXA55",
"A64FX"
"A64FX",
"NEOVERSEV1",
"NEOVERSEN2",
"CORTEXX1",
"CORTEXX2",
"CORTEXA510",
"CORTEXA710",
"FT2000"
};
static char *cpuname_lower[] = {
@ -94,12 +105,17 @@ static char *cpuname_lower[] = {
"tsv110",
"emag8180",
"neoversen1",
"neoversev1",
"neoversen2",
"thunderx3t110",
"vortex",
"cortexa55",
"a64fx"
"a64fx",
"neoversev1",
"neoversen2",
"cortexx1",
"cortexx2",
"cortexa510",
"cortexa710",
"ft2000"
};
int get_feature(char *search)
@ -182,6 +198,14 @@ int detect(void)
return CPU_NEOVERSEN2;
else if (strstr(cpu_part, "0xd05"))
return CPU_CORTEXA55;
else if (strstr(cpu_part, "0xd46"))
return CPU_CORTEXA510;
else if (strstr(cpu_part, "0xd47"))
return CPU_CORTEXA710;
else if (strstr(cpu_part, "0xd44"))
return CPU_CORTEXX1;
else if (strstr(cpu_part, "0xd4c"))
return CPU_CORTEXX2;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@ -202,6 +226,13 @@ int detect(void)
// Fujitsu
else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001"))
return CPU_A64FX;
// Apple
else if (strstr(cpu_implementer, "0x61") && strstr(cpu_part, "0x022"))
return CPU_VORTEX;
// Phytium
else if (strstr(cpu_implementer, "0x70") && (strstr(cpu_part, "0x660") || strstr(cpu_part, "0x661")
|| strstr(cpu_part, "0x662") || strstr(cpu_part, "0x663")))
return CPU_FT2000;
}
p = (char *) NULL ;
@ -382,7 +413,24 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 48\n");
printf("#define DTB_SIZE 4096\n");
break;
/* Cortex-A510, Cortex-A710, Cortex-X1 and Cortex-X2 share one configuration:
 * ARMV9 is defined, the core-specific macro is taken from cpuname[d], and
 * the same cache/TLB parameters are emitted for all four. */
case CPU_CORTEXA510:
case CPU_CORTEXA710:
case CPU_CORTEXX1:
case CPU_CORTEXX2:
printf("#define ARMV9\n");
printf("#define %s\n", cpuname[d]);
printf("#define L1_CODE_SIZE 65536\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 4\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FALKOR:
printf("#define FALKOR\n");
printf("#define L1_CODE_SIZE 65536\n");
@ -469,9 +517,9 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#ifdef __APPLE__
case CPU_VORTEX:
printf("#define VORTEX \n");
#ifdef __APPLE__
sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0);
printf("#define L1_CODE_SIZE %lld \n",value64);
sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0);
@ -480,10 +528,10 @@ void get_cpuconfig(void)
printf("#define L1_DATA_SIZE %lld \n",value64);
sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0);
printf("#define L2_SIZE %lld \n",value64);
#endif
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
#endif
case CPU_A64FX:
printf("#define A64FX\n");
printf("#define L1_CODE_SIZE 65535\n");
@ -494,6 +542,16 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_FT2000:
printf("#define FT2000\n");
printf("#define L1_CODE_SIZE 32768\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 33554432\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
get_cpucount();
}

View File

@ -33,30 +33,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdint.h>
#define CPU_UNKNOWN 0
#define CPU_LOONGSON3R5 1
/* If LASX extension instructions are supported,
* use core LOONGSON3R5.
* If only LSX extension instructions are supported,
* use core LOONGSON2K1000.
* If neither LASX nor LSX extension instructions are supported,
* use core LOONGSONGENERIC (as far as I know, there is no such
* CPU yet).
*/
#define CPU_GENERIC 0
#define CPU_LOONGSON3R5 1
#define CPU_LOONGSON2K1000 2
#define LOONGARCH_CFG2 0x02
/* Bit masks tested against the word that detect() reads with cpucfg for
 * LOONGARCH_CFG2. The shifts are parenthesized so each macro expands as a
 * single value even next to higher-precedence operators. */
#define LOONGARCH_LASX (1<<7)
#define LOONGARCH_LSX (1<<6)
static char *cpuname[] = {
"UNKNOWN",
"LOONGSON3R5"
"LOONGSONGENERIC",
"LOONGSON3R5",
"LOONGSON2K1000"
};
static char *cpuname_lower[] = {
"loongsongeneric",
"loongson3r5",
"loongson2k1000"
};
int detect(void) {
uint32_t reg = 0;
#ifdef __linux
uint32_t reg = 0;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG2)
);
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG2)
);
if (reg & LOONGARCH_LASX)
return CPU_LOONGSON3R5;
else
return CPU_UNKNOWN;
if (reg & LOONGARCH_LASX)
return CPU_LOONGSON3R5;
else if (reg & LOONGARCH_LSX)
return CPU_LOONGSON2K1000;
else
return CPU_GENERIC;
#endif
return CPU_GENERIC;
}
char *get_corename(void) {
@ -68,11 +91,8 @@ void get_architecture(void) {
}
/* Print the upper-case name of the detected core to stdout, exactly once
 * and without a trailing newline. The name comes from cpuname[], indexed by
 * the value detect() returns. */
void get_subarchitecture(void) {
    int d = detect();
    printf("%s", cpuname[d]);
}
void get_subdirname(void) {
@ -80,31 +100,44 @@ void get_subdirname(void) {
}
/* Print the preprocessor configuration for the detected core to stdout.
 *
 * One "#define <CORE>" line is emitted, followed by the cache and TLB
 * parameters for that core. Any value returned by detect() other than
 * CPU_LOONGSON3R5 or CPU_LOONGSON2K1000 is reported as LOONGSONGENERIC. */
void get_cpuconfig(void) {
    int d = detect();

    switch (d) {
    case CPU_LOONGSON3R5:
        printf("#define LOONGSON3R5\n");
        printf("#define L1_DATA_SIZE 65536\n");
        printf("#define L1_DATA_LINESIZE 64\n");
        printf("#define L2_SIZE 1048576\n");
        printf("#define L2_LINESIZE 64\n");
        printf("#define DTB_DEFAULT_ENTRIES 64\n");
        printf("#define DTB_SIZE 4096\n");
        printf("#define L2_ASSOCIATIVE 16\n");
        break;
    case CPU_LOONGSON2K1000:
        printf("#define LOONGSON2K1000\n");
        printf("#define L1_DATA_SIZE 65536\n");
        printf("#define L1_DATA_LINESIZE 64\n");
        printf("#define L2_SIZE 262144\n");
        printf("#define L2_LINESIZE 64\n");
        printf("#define DTB_DEFAULT_ENTRIES 64\n");
        printf("#define DTB_SIZE 4096\n");
        printf("#define L2_ASSOCIATIVE 16\n");
        break;
    default:
        printf("#define LOONGSONGENERIC\n");
        printf("#define L1_DATA_SIZE 65536\n");
        printf("#define L1_DATA_LINESIZE 64\n");
        printf("#define L2_SIZE 262144\n");
        printf("#define L2_LINESIZE 64\n");
        printf("#define DTB_DEFAULT_ENTRIES 64\n");
        printf("#define DTB_SIZE 4096\n");
        printf("#define L2_ASSOCIATIVE 16\n");
        break;
    }
}
/* Print the lower-case library name of the detected core to stdout, exactly
 * once and without a trailing newline. The name comes from cpuname_lower[],
 * indexed by the value detect() returns. */
void get_libname(void){
    int d = detect();
    printf("%s", cpuname_lower[d]);
}

View File

@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
Copyright (c) 2011-2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -13,9 +13,9 @@ met:
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@ -70,16 +70,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define CPU_UNKNOWN 0
#define CPU_C910V 1
#define CPU_GENERIC 0
#define CPU_C910V 1
/* Core names indexed by the CPU_* identifiers (CPU_GENERIC = 0,
 * CPU_C910V = 1). get_subarchitecture() and get_cpuconfig() index this table
 * directly with the value returned by detect(), so the order must match. */
static char *cpuname[] = {
    "RISCV64_GENERIC",
    "C910V"
};
/* Identify the RISC-V core by scanning /proc/cpuinfo (Linux only).
 *
 * Returns CPU_C910V when the "model name" value contains "T-HEAD C910" and
 * the "isa" value contains a 'v' after its first '4'; returns CPU_GENERIC in
 * every other case: non-Linux build, unreadable /proc/cpuinfo, or either
 * line missing or malformed. */
int detect(void){
#ifdef __linux
    FILE *infile;
    char buffer[512], isa_buffer[512], model_buffer[512];
    const char *check_c910_str = "T-HEAD C910";
    char *pmodel = NULL, *pisa = NULL;
    char *sep;

    infile = fopen("/proc/cpuinfo", "r");
    if (!infile)
        return CPU_GENERIC;

    while (fgets(buffer, sizeof(buffer), infile)) {
        if (!strncmp(buffer, "model name", 10)) {
            /* Keep a private copy: buffer is overwritten by the next fgets(). */
            strcpy(model_buffer, buffer);
            /* The value follows the ':' of the copied model line. */
            sep = strchr(model_buffer, ':');
            pmodel = sep ? sep + 1 : NULL;
        }
        if (!strncmp(buffer, "isa", 3)) {
            strcpy(isa_buffer, buffer);
            /* Extension letters are searched after the first '4'
             * (presumably the end of an "rv64" prefix; TODO confirm). */
            sep = strchr(isa_buffer, '4');
            pisa = sep ? sep + 1 : NULL;
        }
    }
    fclose(infile);

    if (!pmodel || !pisa)
        return CPU_GENERIC;
    if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
        return CPU_C910V;
    return CPU_GENERIC;
#endif
    return CPU_GENERIC;
}
char *get_corename(void){
@ -91,6 +121,7 @@ void get_architecture(void){
}
void get_subarchitecture(void){
printf("%s",cpuname[detect()]);
}
void get_subdirname(void){
@ -98,7 +129,7 @@ void get_subdirname(void){
}
void get_cpuconfig(void){
printf("#define UNKNOWN\n");
printf("#define %s\n", cpuname[detect()]);
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");

View File

@ -1707,8 +1707,18 @@ int get_cpuname(void){
if (model == 0xf && stepping < 0xe)
return CPUTYPE_NANO;
return CPUTYPE_NEHALEM;
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CPUTYPE_ZEN;
else
return CPUTYPE_DUNNINGTON;
default:
return CPUTYPE_NEHALEM;
}
default:
if (family >= 0x7)
if (family >= 0x8)
return CPUTYPE_NEHALEM;
else
return CPUTYPE_VIAC3;
@ -1716,7 +1726,20 @@ int get_cpuname(void){
}
if (vendor == VENDOR_ZHAOXIN){
return CPUTYPE_NEHALEM;
switch (family) {
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CPUTYPE_ZEN;
else
return CPUTYPE_DUNNINGTON;
default:
return CPUTYPE_NEHALEM;
}
default:
return CPUTYPE_NEHALEM;
}
}
if (vendor == VENDOR_RISE){
@ -2416,8 +2439,18 @@ int get_coretype(void){
if (model == 0xf && stepping < 0xe)
return CORE_NANO;
return CORE_NEHALEM;
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CORE_ZEN;
else
return CORE_DUNNINGTON;
default:
return CORE_NEHALEM;
}
default:
if (family >= 0x7)
if (family >= 0x8)
return CORE_NEHALEM;
else
return CORE_VIAC3;
@ -2425,7 +2458,20 @@ int get_coretype(void){
}
if (vendor == VENDOR_ZHAOXIN) {
return CORE_NEHALEM;
switch (family) {
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return CORE_ZEN;
else
return CORE_DUNNINGTON;
default:
return CORE_NEHALEM;
}
default:
return CORE_NEHALEM;
}
}
return CORE_UNKNOWN;

View File

@ -44,6 +44,10 @@ COMPILER_DEC
COMPILER_GNU
#endif
#if defined(__fcc_version__) || defined(__FCC_version__)
COMPILER_FUJITSU
#endif
#if defined(__ANDROID__)
OS_ANDROID
#endif

View File

@ -1,7 +1,9 @@
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_BINARY_DIR})
if (NOT NOFORTRAN)
enable_language(Fortran)
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
@ -28,14 +30,24 @@ foreach(float_type ${FLOAT_TYPES})
continue()
endif()
#level1
if (NOT NOFORTRAN)
add_executable(x${float_char}cblat1
c_${float_char}blat1.f
c_${float_char}blas1.c)
else()
add_executable(x${float_char}cblat1
c_${float_char}blat1c.c
c_${float_char}blas1.c)
endif()
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
target_link_libraries(x${float_char}cblat1 m)
endif()
add_test(NAME "x${float_char}cblat1"
COMMAND $<TARGET_FILE:x${float_char}cblat1>)
#level2
if (NOT NOFORTRAN)
add_executable(x${float_char}cblat2
c_${float_char}blat2.f
c_${float_char}blas2.c
@ -43,11 +55,24 @@ foreach(float_type ${FLOAT_TYPES})
auxiliary.c
c_xerbla.c
constant.c)
else()
add_executable(x${float_char}cblat2
c_${float_char}blat2c.c
c_${float_char}blas2.c
c_${float_char}2chke.c
auxiliary.c
c_xerbla.c
constant.c)
endif()
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
target_link_libraries(x${float_char}cblat2 m)
endif()
add_test(NAME "x${float_char}cblat2"
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
#level3
if (NOT NOFORTRAN)
add_executable(x${float_char}cblat3
c_${float_char}blat3.f
c_${float_char}blas3.c
@ -55,7 +80,19 @@ foreach(float_type ${FLOAT_TYPES})
auxiliary.c
c_xerbla.c
constant.c)
else()
add_executable(x${float_char}cblat3
c_${float_char}blat3c.c
c_${float_char}blas3.c
c_${float_char}3chke.c
auxiliary.c
c_xerbla.c
constant.c)
endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
target_link_libraries(x${float_char}cblat3 m)
endif()
add_test(NAME "x${float_char}cblat3"
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")

View File

@ -43,11 +43,7 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
ifeq ($(NOFORTRAN),1)
all ::
else
all :: all1 all2 all3
endif
ifeq ($(BUILD_SINGLE),1)
all1targets += xscblat1
@ -222,53 +218,83 @@ endif
ifeq ($(BUILD_SINGLE),1)
# Single real
ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN)))
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
else
xscblat1: $(stestl1o) c_sblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xscblat1 c_sblat1c.o $(stestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xscblat2: $(stestl2o) c_sblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xscblat2 c_sblat2c.o $(stestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xscblat3: $(stestl3o) c_sblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xscblat3 c_sblat3c.o $(stestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
ifeq ($(BUILD_DOUBLE),1)
# Double real
ifeq ($(NOFORTRAN),0)
xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
else
xdcblat1: $(dtestl1o) c_dblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xdcblat1 c_dblat1c.o $(dtestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xdcblat2: $(dtestl2o) c_dblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xdcblat2 c_dblat2c.o $(dtestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xdcblat3: $(dtestl3o) c_dblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xdcblat3 c_dblat3c.o $(dtestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
ifeq ($(BUILD_COMPLEX),1)
# Single complex
ifeq ($(NOFORTRAN),0)
xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
else
xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
ifeq ($(BUILD_COMPLEX16),1)
# Double complex
ifeq ($(NOFORTRAN),0)
xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
else
xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
endif
endif
include $(TOPDIR)/Makefile.tail

1273
ctest/c_cblat1c.c Normal file

File diff suppressed because it is too large Load Diff

4351
ctest/c_cblat2c.c Normal file

File diff suppressed because it is too large Load Diff

4066
ctest/c_cblat3c.c Normal file

File diff suppressed because it is too large Load Diff

1191
ctest/c_dblat1c.c Normal file

File diff suppressed because it is too large Load Diff

4117
ctest/c_dblat2c.c Normal file

File diff suppressed because it is too large Load Diff

3656
ctest/c_dblat3c.c Normal file

File diff suppressed because it is too large Load Diff

1404
ctest/c_sblat1c.c Normal file

File diff suppressed because it is too large Load Diff

4244
ctest/c_sblat2c.c Normal file

File diff suppressed because it is too large Load Diff

3652
ctest/c_sblat3c.c Normal file

File diff suppressed because it is too large Load Diff

1128
ctest/c_zblat1c.c Normal file

File diff suppressed because it is too large Load Diff

4358
ctest/c_zblat2c.c Normal file

File diff suppressed because it is too large Load Diff

4276
ctest/c_zblat3c.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -27,11 +27,15 @@ else
ifeq ($(ARCH),mips64)
COMMONOBJS += dynamic_mips64.$(SUFFIX)
else
ifeq ($(ARCH),loongarch64)
COMMONOBJS += dynamic_loongarch64.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX)
endif
endif
endif
endif
endif
else
COMMONOBJS += parameter.$(SUFFIX)
endif
@ -99,11 +103,15 @@ else
ifeq ($(ARCH),mips64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX)
else
ifeq ($(ARCH),loongarch64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
endif
endif
endif
endif
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif

View File

@ -352,6 +352,20 @@ int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set)
return pthread_setaffinity_np(thread, cpusetsize, cpu_set);
}
/* Query the CPU affinity mask of one OpenBLAS thread.
 *
 * thread_idx must lie in [0, openblas_get_num_threads()); otherwise errno is
 * set to EINVAL and -1 is returned. The highest valid index refers to the
 * calling thread, every lower index to the matching entry of blas_threads[].
 * On a valid index the result of pthread_getaffinity_np() is returned. */
int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
    const int nthreads = openblas_get_num_threads();
    pthread_t target;

    if (thread_idx < 0 || thread_idx >= nthreads) {
        errno = EINVAL;
        return -1;
    }

    if (thread_idx == nthreads - 1) {
        target = pthread_self();
    } else {
        target = blas_threads[thread_idx];
    }

    return pthread_getaffinity_np(target, cpusetsize, cpu_set);
}
#endif
static void* blas_thread_server(void *arg){

View File

@ -403,6 +403,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
break;
}
if (openblas_omp_adaptive_env() != 0) {
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
for (i = 0; i < num; i ++) {
@ -412,6 +413,17 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
exec_threads(&queue[i], buf_index);
}
} else {
#pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3
queue[i].position = i;
#endif
exec_threads(&queue[i], buf_index);
}
}
#ifdef HAVE_C11
atomic_store(&blas_buffer_inuse[buf_index], false);

View File

@ -96,7 +96,7 @@ extern gotoblas_t gotoblas_BARCELONA;
#endif
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
elif defined(DYN_NEHALEM)
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
@ -855,7 +855,11 @@ static gotoblas_t *get_coretype(void){
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
} else if (exfamily == 10) {
} else if (exfamily == 10) {
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
if(support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx())
return &gotoblas_ZEN;
else{
@ -863,7 +867,7 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else {
return &gotoblas_BARCELONA;
return NULL;
}
}
@ -875,14 +879,37 @@ static gotoblas_t *get_coretype(void){
if (model == 0xf && stepping < 0xe)
return &gotoblas_NANO;
return &gotoblas_NEHALEM;
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return &gotoblas_ZEN;
else
return &gotoblas_DUNNINGTON;
default:
return &gotoblas_NEHALEM;
}
default:
if (family >= 0x7)
if (family >= 0x8)
return &gotoblas_NEHALEM;
}
}
if (vendor == VENDOR_ZHAOXIN) {
return &gotoblas_NEHALEM;
switch (family) {
case 0x7:
switch (exmodel) {
case 5:
if (support_avx2())
return &gotoblas_ZEN;
else
return &gotoblas_DUNNINGTON;
default:
return &gotoblas_NEHALEM;
}
default:
return &gotoblas_NEHALEM;
}
}
return NULL;

View File

@ -99,6 +99,16 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
#else
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
#endif
#ifdef DYN_NEOVERSEV1
extern gotoblas_t gotoblas_NEOVERSEV1;
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#endif
#ifdef DYN_NEOVERSEN2
extern gotoblas_t gotoblas_NEOVERSEN2;
#else
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55
extern gotoblas_t gotoblas_CORTEXA55;
#else
@ -115,6 +125,8 @@ extern gotoblas_t gotoblas_THUNDERX2T99;
extern gotoblas_t gotoblas_TSV110;
extern gotoblas_t gotoblas_EMAG8180;
extern gotoblas_t gotoblas_NEOVERSEN1;
extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_THUNDERX3T110;
extern gotoblas_t gotoblas_CORTEXA55;
#endif
@ -166,8 +178,10 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
if (gotoblas == &gotoblas_CORTEXA55) return corename[12];
if (gotoblas == &gotoblas_NEOVERSEV1) return corename[11];
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
return corename[NUM_CORETYPES];
}
@ -198,8 +212,10 @@ static gotoblas_t *force_coretype(char *coretype) {
case 8: return (&gotoblas_TSV110);
case 9: return (&gotoblas_EMAG8180);
case 10: return (&gotoblas_NEOVERSEN1);
case 11: return (&gotoblas_THUNDERX3T110);
case 12: return (&gotoblas_CORTEXA55);
case 11: return (&gotoblas_NEOVERSEV1);
case 12: return (&gotoblas_NEOVERSEN2);
case 13: return (&gotoblas_THUNDERX3T110);
case 14: return (&gotoblas_CORTEXA55);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@ -258,6 +274,10 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_CORTEXA73;
case 0xd0c: // Neoverse N1
return &gotoblas_NEOVERSEN1;
case 0xd49:
return &gotoblas_NEOVERSEN2;
case 0xd40:
return &gotoblas_NEOVERSEV1;
case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55;
}

View File

@ -0,0 +1,128 @@
/*******************************************************************************
Copyright (c) 2022, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
extern gotoblas_t gotoblas_LOONGSON3R5;
extern gotoblas_t gotoblas_LOONGSON2K1000;
extern gotoblas_t gotoblas_LOONGSONGENERIC;
extern void openblas_warning(int verbose, const char * msg);
/* Number of selectable core types. It is also the index of the trailing
 * "unknown" entry of corename[], which gotoblas_corename() returns when the
 * active kernel table matches none of the known cores. */
#define NUM_CORETYPES 3
/* Lower-case core names. Entries 0..NUM_CORETYPES-1 are matched
 * case-insensitively against OPENBLAS_CORETYPE in force_coretype(); their
 * order must agree with the index-to-table mapping used there and in
 * gotoblas_corename(). */
static char *corename[] = {
"loongson3r5",
"loongson2k1000",
"loongsongeneric",
"unknown"
};
/* Return the lower-case name of the kernel table currently selected in
 * `gotoblas`, or the "unknown" entry when it matches no known table. */
char *gotoblas_corename(void) {
    int idx = NUM_CORETYPES; /* index of the "unknown" fallback entry */

    if (gotoblas == &gotoblas_LOONGSON3R5)
        idx = 0;
    else if (gotoblas == &gotoblas_LOONGSON2K1000)
        idx = 1;
    else if (gotoblas == &gotoblas_LOONGSONGENERIC)
        idx = 2;

    return corename[idx];
}
/* Map a user-supplied core name to its kernel table.
 *
 * coretype is compared case-insensitively (first 20 characters at most)
 * against the first NUM_CORETYPES entries of corename[]. The first match
 * selects the table with the same index. When nothing matches, a
 * "Core not found" warning is issued and NULL is returned. */
static gotoblas_t *force_coretype(char *coretype) {
    /* Same order as corename[]. */
    gotoblas_t *const tables[NUM_CORETYPES] = {
        &gotoblas_LOONGSON3R5,
        &gotoblas_LOONGSON2K1000,
        &gotoblas_LOONGSONGENERIC
    };
    char message[128];
    int idx;

    for (idx = 0; idx < NUM_CORETYPES; idx++) {
        if (strncasecmp(coretype, corename[idx], 20) == 0)
            return tables[idx];
    }

    snprintf(message, 128, "Core not found: %s\n", coretype);
    openblas_warning(1, message);
    return NULL;
}
/* Bit masks tested against the cpucfg result in get_coretype(). The bodies
 * are parenthesized so the macros expand safely inside any larger
 * expression, not only next to '&'. */
#define LASX_MASK (1 << 7)
#define LSX_MASK  (1 << 6)
/* Configuration word index passed to the cpucfg instruction. */
#define LOONGARCH_CFG2 0x02
/* Select the kernel table for the running CPU.
 *
 * Issues cpucfg with LOONGARCH_CFG2 and tests the result against LASX_MASK
 * first, then LSX_MASK. The LASX bit selects LOONGSON3R5, the LSX bit alone
 * selects LOONGSON2K1000, and neither selects LOONGSONGENERIC. */
static gotoblas_t *get_coretype(void) {
    int cfg = 0;

    __asm__ volatile (
        "cpucfg %0, %1 \n\t"
        : "+&r"(cfg)
        : "r"(LOONGARCH_CFG2)
    );

    if (cfg & LASX_MASK)
        return &gotoblas_LOONGSON3R5;
    if (cfg & LSX_MASK)
        return &gotoblas_LOONGSON2K1000;
    return &gotoblas_LOONGSONGENERIC;
}
/* Select and initialise the kernel table for this process.
 *
 * Does nothing if a table is already selected. Otherwise the table is taken
 * from the OPENBLAS_CORETYPE environment variable when it is set (via
 * force_coretype()), or from CPU detection (get_coretype()) when it is not.
 * If no table was selected, or the table has no init hook, an error is
 * reported and the process exits with status 1. */
void gotoblas_dynamic_init(void) {
    char coremsg[128];
    char *p;

    if (gotoblas) return;

    p = getenv("OPENBLAS_CORETYPE");
    if (p)
        gotoblas = force_coretype(p);
    else
        gotoblas = get_coretype();

    if (gotoblas && gotoblas->init) {
        /* Bounded formatting: at most 20 characters of the core name are
         * used, and the message is always NUL-terminated. This avoids the
         * strncpy()/sprintf() pair, which leaves the copy unterminated when
         * the name is 20 characters or longer. */
        snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename());
        openblas_warning(2, coremsg);
        gotoblas->init();
    } else {
        openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
        exit(1);
    }
}
/* Drop the selected kernel table. gotoblas_dynamic_init() returns early
 * while `gotoblas` is non-NULL, so clearing it lets a later init call
 * perform the selection again. */
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@ -877,21 +877,21 @@ void gotoblas_affinity_init(void) {
nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(__GLIBC_PREREQ)
common->num_procs = nums;
common->num_procs = nums >0 ? nums : 2;
#else
#if !__GLIBC_PREREQ(2, 3)
common->num_procs = nums;
common->num_procs = nums >0 ? nums : 2;
#elif __GLIBC_PREREQ(2, 7)
cpusetp = CPU_ALLOC(nums);
cpusetp = CPU_ALLOC(nums>0? nums:1024);
if (cpusetp == NULL) {
common->num_procs = nums;
common->num_procs = nums>0 ? nums: 2;
} else {
size_t size;
size = CPU_ALLOC_SIZE(nums);
size = CPU_ALLOC_SIZE(nums>0? nums: 1024);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0)
common->num_procs = nums;
common->num_procs = nums >0 ? nums : 1;
else
common->num_procs = CPU_COUNT_S(size,cpusetp);
}
@ -899,12 +899,12 @@ void gotoblas_affinity_init(void) {
#else
ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset);
if (ret!=0) {
common->num_procs = nums;
common->num_procs = nums >0 ? nums : 2;
} else {
#if !__GLIBC_PREREQ(2, 6)
int i;
int n = 0;
for (i=0;i<nums;i++)
for (i=0;i<(nums >0 ?nums:1024) ;i++)
if (CPU_ISSET(i,&cpuset)) n++;
common->num_procs = n;
}
@ -1022,7 +1022,7 @@ void gotoblas_set_affinity2(int threads) {};
void gotoblas_affinity_reschedule(void) {};
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); }
/* Number of configured processors as reported by
 * sysconf(_SC_NPROCESSORS_CONF). Falls back to 2 when sysconf() reports an
 * error or a non-positive count. */
int get_num_procs(void) {
    int nums = sysconf(_SC_NPROCESSORS_CONF);
    return (nums > 0 ? nums : 2);
}
int get_num_nodes(void) { return 1; }

View File

@ -252,23 +252,23 @@ int get_num_procs(void) {
ret = omp_get_num_places();
if (ret >0 ) nums = ret;
#endif
return nums;
return (nums > 0 ? nums : 2);
#endif
#if !defined(OS_LINUX)
return nums;
return (nums > 0 ? nums : 2);
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return (nums > 0 ? nums :2);
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return (nums > 0 ? nums :2);
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
if (ret!=0) return (nums > 0 ? nums :2);
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
@ -277,31 +277,31 @@ int get_num_procs(void) {
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
return (nums > 0 ? nums :2);
#else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
return (nums > 0 ? nums :2);
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
return (nums > 0 ? nums :2);
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
return (nums > 0 ? nums :2);
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
return (nums > 0 ? nums :2);
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
return (nums > 0 ? nums :2);
}
#endif
#endif
@ -1823,56 +1823,56 @@ int get_num_procs(void) {
ret = omp_get_num_places();
if (ret >0 ) nums = ret;
#endif
return nums;
return (nums > 0 ? nums :2);
#endif
#if !defined(OS_LINUX)
return nums;
return (nums > 0 ? nums :2);
#endif
#if !defined(__GLIBC_PREREQ)
return nums;
return (nums > 0 ? nums :2);
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return (nums > 0 ? nums :2);
#endif
#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
if (ret!=0) return (nums > 0 ? nums :2);
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
for (i=0;i<(nums > 0 ? nums :2);i++)
if (CPU_ISSET(i,&cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
return (nums > 0 ? nums :2);
#else
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
return (nums > 0 ? nums :2);
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
return (nums > 0 ? nums :2);
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
return (nums > 0 ? nums :2);
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
return (nums > 0 ? nums :2);
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
return (nums > 0 ? nums :2);
}
#endif
#endif

View File

@ -39,6 +39,7 @@ static int openblas_env_block_factor=0;
static int openblas_env_openblas_num_threads=0;
static int openblas_env_goto_num_threads=0;
static int openblas_env_omp_num_threads=0;
static int openblas_env_omp_adaptive=0;
int openblas_verbose() { return openblas_env_verbose;}
unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
@ -46,6 +47,7 @@ int openblas_block_factor() { return openblas_env_block_factor;}
int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
int openblas_omp_adaptive_env() { return openblas_env_omp_adaptive;}
void openblas_read_env() {
int ret=0;
@ -79,6 +81,11 @@ void openblas_read_env() {
if(ret<0) ret=0;
openblas_env_omp_num_threads=ret;
ret=0;
if (readenv(p,"OMP_ADAPTIVE")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_omp_adaptive=ret;
}

View File

@ -60,6 +60,9 @@ static char* openblas_config_str=""
#ifdef USE_OPENMP
"USE_OPENMP "
#endif
#ifdef USE_TLS
"USE_TLS "
#endif
#ifndef DYNAMIC_ARCH
CHAR_CORENAME
#endif

View File

@ -2,6 +2,12 @@ TOPDIR = ..
include ../Makefile.system
ifdef USE_PERL
GENSYM = gensymbol.pl
else
GENSYM = gensymbol
endif
ifndef EXPRECISION
EXPRECISION = 0
endif
@ -119,11 +125,11 @@ dll : ../$(LIBDLLNAME)
-shared -o ../$(LIBDLLNAME) -Wl,--out-implib,../$(IMPLIBNAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
$(LIBPREFIX).def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
$(LIBPREFIX).def : $(GENSYM)
./$(GENSYM) win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
libgoto_hpl.def : $(GENSYM)
./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
@ -265,24 +271,24 @@ static : ../$(LIBNAME)
$(AR) -cq ../$(LIBNAME) goto.$(SUFFIX)
rm -f goto.$(SUFFIX)
osx.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
osx.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
aix.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
aix.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
objcopy.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
objconv.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
objconv.def : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
test : linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest
linktest.c : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
./$(GENSYM) linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
clean ::
@rm -f *.def *.dylib __.SYMDEF* *.renamed

7721
exports/gensymbol Normal file → Executable file

File diff suppressed because it is too large Load Diff

3972
exports/gensymbol.pl Normal file

File diff suppressed because it is too large Load Diff

742
f_check Normal file → Executable file
View File

@ -1,6 +1,16 @@
#!/usr/bin/env perl
#!/bin/sh
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
# split STRING SEPARATORS
# Print the fields of STRING, split on the characters in SEPARATORS, each
# field followed by a single space.
split() {
# Turn off pathname expansion so that glob characters inside a field are not
# expanded by the unquoted expansion below.
set -f
old_ifs=$IFS
IFS=$2
# Deliberately unquoted: field splitting on the temporary IFS turns $1 into
# the positional parameters.
set -- $1
printf '%s ' "$@"
# Put IFS and pathname expansion back the way they were.
IFS=$old_ifs
set +f
}
hostos="$(uname -s | sed 's/\-.*//')"
#
# 1. Not specified
@ -12,407 +22,397 @@ $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
# 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition
#
$makefile = shift(@ARGV);
$config = shift(@ARGV);
makefile="$1"
config="$2"
$nofortran = 0;
nofortran=0
shift 2
compiler="$*"
compiler_bin="$1"
$compiler = join(" ", @ARGV);
$compiler_bin = shift(@ARGV);
# f77 is too ambiguous
$compiler = "" if $compiler eq "f77";
[ "$compiler" = "f77" ] && compiler=''
@path = split(/:/, $ENV{"PATH"});
path=`split "$PATH" ':'`
if ($compiler eq "") {
if [ -z "$compiler" ]; then
@lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95",
"sunf77", "sunf90", "sunf95",
"xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
"pathf90", "pathf95",
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
"flang", "egfortran",
"ifort", "nagfor");
lists="gfortran g95 frt fort openf90 openf95
sunf77 sunf90 sunf95
xlf95 xlf90 xlf
ppuf77 ppuf95 ppuf90 ppuxlf
pathf90 pathf95
pgf95 pgf90 pgf77 pgfortran nvfortran
flang egfortran
ifort nagfor ifx ftn crayftn"
OUTER:
foreach $lists (@lists) {
foreach $path (@path) {
if (-x $path . "/" . $lists) {
$compiler = $lists;
$compiler_bin = $lists;
last OUTER;
for list in $lists; do
for p in $path; do
if [ -x "$p/$list" ]; then
compiler=$list
compiler_bin=$list
break 2
fi
done
done
fi
if [ -z "$compiler" ]; then
nofortran=1
compiler=gfortran
vendor=GFORTRAN
bu="_"
else
{
data="$(command -v "$compiler_bin" >/dev/null 2>&1)"
vendor=""
} && {
data=`$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`
if [ -z "$data" ]; then
data=`$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`
fi
case "$data" in *zhoge_*) bu=_ ;; esac
case "$data" in
*Fujitsu*)
vendor=FUJITSU
openmp='-Kopenmp'
;;
*Cray*)
vendor=CRAY
openmp='-fopenmp'
;;
*GNU*|*GCC*)
v="${data#*GCC: *\) }"
v="${v%%\"*}"
major="${v%%.*}"
if [ "$major" -ge 4 ]; then
vendor=GFORTRAN
openmp='-fopenmp'
else
case "$compiler" in
*flang*)
vendor=FLANG
openmp='-fopenmp'
;;
*ifx*)
vendor=INTEL
openmp='-fopenmp'
;;
*pgf*|*nvf*)
vendor=PGI
openmp='-mp'
;;
*)
vendor=G77
openmp=''
;;
esac
fi
;;
*g95*)
vendor=G95
openmp=''
;;
*Intel*)
vendor=INTEL
openmp='-fopenmp'
;;
*'Sun Fortran'*)
vendor=SUN
openmp='-xopenmp=parallel'
;;
*PathScale*)
vendor=PATHSCALE
openmp='-openmp'
;;
*Open64*)
vendor=OPEN64
openmp='-mp'
;;
*PGF*|*NVF*)
vendor=PGI
openmp='-mp'
;;
*'IBM XL'*)
vendor=IBM
openmp='-openmp'
;;
*NAG*)
vendor=NAG
openmp='-openmp'
;;
esac
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
data=`$compiler -O2 -S ftest3.f >/dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`
[ -z "$data" ] && {
data=`$compiler -O2 -S ftest3.f >/dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`
}
case "$data" in *' zho_ge__'*) need2bu=1 ;; esac
case "$vendor" in *G95*) [ "$NO_LAPACKE" != 1 ] && need2bu='' ;; esac
}
if [ -z "$vendor" ]; then
case "$compiler" in
*g77*)
vendor=G77
bu=_
openmp=''
;;
*g95*)
vendor=G95
bu=_
openmp=''
;;
*gfortran*)
vendor=GFORTRAN
bu=_
openmp='-fopenmp'
;;
*ifort*|*ifx*)
vendor=INTEL
bu=_
openmp='-fopenmp'
;;
*pathf*)
vendor=PATHSCALE
bu=_
openmp='-mp'
;;
*pgf*|*nvf*)
vendor=PGI
bu=_
openmp='-mp'
;;
*ftn*)
vendor=PGI
bu=_
openmp=-openmp
;;
*frt*)
vendor=FUJITSU
bu=_
openmp='-openmp'
;;
*sunf77*|*sunf90*|*sunf95*)
vendor=SUN
bu=_
openmp='-xopenmp=parallel'
;;
*ppuf*|*xlf*)
vendor=IBM
openmp='-openmp'
;;
*open64*)
vendor=OPEN64
openmp='-mp'
;;
*flang*)
vendor=FLANG
bu=_
openmp='-fopenmp'
;;
*nagfor*)
vendor=NAG
bu=_
openmp='-openmp'
;;
esac
if [ -z "$vendor" ]; then
nofortran=1
compiler="gfortran"
vendor=GFORTRAN
bu=_
openmp=''
fi
fi
fi
{
data=`command -v $compiler_bin >/dev/null 2>&1`
} && {
binary=$BINARY
[ "$USE_OPENMP" != 1 ] && openmp=''
case "$binary" in
32)
{
link=`$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
link=`$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
# for AIX
link=`$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
# for gfortran MIPS
mips_data=`$compiler_bin -E -dM - < /dev/null`
case "$mips_data" in
*_MIPS_ISA_MIPS64*)
link=`$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
;;
*)
link=`$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`
;;
esac
} || {
binary=''
}
}
}
;;
64)
{
link=`$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
link=`$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
# for AIX
link=`$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
# for gfortran MIPS
link=`$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
# for nagfor
link=`$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`
} || {
binary=''
}
;;
esac
if [ -z "$binary" ]; then
link=`$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`
fi
}
if ($compiler eq "") {
if [ "$vendor" = "NAG" ]; then
link=`$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`
fi
if [ "$vendor" = "CRAY" ]; then
link=`$compiler $openmp -hnopattern ftest2.f 2>&1 && rm -f a.out a.exe`
fi
linker_L=""
linker_l=""
linker_a=""
$nofortran = 1;
$compiler = "gfortran";
$vendor = GFORTRAN;
$bu = "_";
if [ -n "$link" ]; then
} else {
link=`echo "$link" | sed 's/\-Y[[:space:]]P\,/\-Y/g'`
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
$vendor = "";
link=`echo "$link" | sed 's/\-R[[:space:]]*/\-rpath\%/g'`
if (!$?) {
# Join "-rpath <dir>" into a single "-rpath%<dir>" token. sed uses POSIX basic
# regular expressions, where "+" is a literal character, so "one or more
# whitespace" is spelled as x followed by x*.
link=`echo "$link" | sed 's/\-rpath[[:space:]][[:space:]]*/\-rpath\%/g'`
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
if ($data eq "") {
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
}
if ($data =~ /zhoge_/) {
$bu = "_";
}
# Join "-rpath-link <dir>" into a single "-rpath-link%<dir>" token. sed uses
# POSIX basic regular expressions, where "+" is a literal character, so "one
# or more whitespace" is spelled as x followed by x*.
link=`echo "$link" | sed 's/\-rpath-link[[:space:]][[:space:]]*/\-rpath-link\%/g'`
if ($data =~ /Fujitsu/) {
$vendor = FUJITSU;
$openmp = "-Kopenmp";
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
$data =~ s/\(+.*?\)+//g;
$data =~ /(\d+)\.(\d+).(\d+)/;
$major = $1;
$minor = $2;
if ($major >= 4) {
$vendor = GFORTRAN;
$openmp = "-fopenmp";
} else {
if ($compiler =~ /flang/) {
$vendor = FLANG;
$openmp = "-fopenmp";
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI;
$openmp = "-mp";
} else {
$vendor = G77;
$openmp = "";
}
}
}
if ($data =~ /g95/) {
$vendor = G95;
$openmp = "";
}
if ($data =~ /Intel/) {
$vendor = INTEL;
$openmp = "-fopenmp";
}
if ($data =~ /Sun Fortran/) {
$vendor = SUN;
$openmp = "-xopenmp=parallel";
}
if ($data =~ /PathScale/) {
$vendor = PATHSCALE;
$openmp = "-openmp";
}
if ($data =~ /Open64/) {
$vendor = OPEN64;
$openmp = "-mp";
}
if ($data =~ /PGF/ || $data =~ /NVF/) {
$vendor = PGI;
$openmp = "-mp";
}
if ($data =~ /IBM XL/) {
$vendor = IBM;
$openmp = "-openmp";
}
if ($data =~ /NAG/) {
$vendor = NAG;
$openmp = "-openmp";
}
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data eq "") {
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
}
if ($data =~ / zho_ge__/) {
$need2bu = 1;
}
if ($vendor =~ /G95/) {
if ($ENV{NO_LAPACKE} != 1) {
$need2bu = "";
}
}
}
if ($vendor eq "") {
if ($compiler =~ /g77/) {
$vendor = G77;
$bu = "_";
$openmp = "";
}
if ($compiler =~ /g95/) {
$vendor = G95;
$bu = "_";
$openmp = "";
}
if ($compiler =~ /gfortran/) {
$vendor = GFORTRAN;
$bu = "_";
$openmp = "-fopenmp";
}
if ($compiler =~ /ifort/) {
$vendor = INTEL;
$bu = "_";
$openmp = "-fopenmp";
}
if ($compiler =~ /pathf/) {
$vendor = PATHSCALE;
$bu = "_";
$openmp = "-mp";
}
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI;
$bu = "_";
$openmp = "-mp";
}
if ($compiler =~ /ftn/) {
$vendor = PGI;
$bu = "_";
$openmp = "-openmp";
}
if ($compiler =~ /frt/) {
$vendor = FUJITSU;
$bu = "_";
$openmp = "-openmp";
}
if ($compiler =~ /sunf77|sunf90|sunf95/) {
$vendor = SUN;
$bu = "_";
$openmp = "-xopenmp=parallel";
}
if ($compiler =~ /ppuf/) {
$vendor = IBM;
$openmp = "-openmp";
}
if ($compiler =~ /xlf/) {
$vendor = IBM;
$openmp = "-openmp";
}
if ($compiler =~ /open64/) {
$vendor = OPEN64;
$openmp = "-mp";
}
if ($compiler =~ /flang/) {
$vendor = FLANG;
$bu = "_";
$openmp = "-fopenmp";
}
if ($compiler =~ /nagfor/) {
$vendor = NAG;
$bu = "_";
$openmp = "-openmp";
}
if ($vendor eq "") {
$nofortran = 1;
$compiler = "gfortran";
$vendor = GFORTRAN;
$bu = "_";
$openmp = "";
}
}
}
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
if (!$?) {
$binary = $ENV{"BINARY"};
$openmp = "" if $ENV{USE_OPENMP} != 1;
if ($binary == 32) {
$link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
if ($?) {
$link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
# for AIX
if ($?) {
$link = `$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For gfortran MIPS
if ($?) {
$mips_data = `$compiler_bin -E -dM - < /dev/null`;
if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
} else {
$link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
$binary = "" if ($?);
}
if ($binary == 64) {
$link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
if ($?) {
$link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
# for AIX
if ($?) {
$link = `$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For nagfor
if ($?) {
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$binary = "" if ($?);
}
if ($binary eq "") {
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
if ( $vendor eq "NAG") {
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$linker_L = "";
$linker_l = "";
$linker_a = "";
if ($link ne "") {
$link =~ s/\-Y\sP\,/\-Y/g;
$link =~ s/\-R\s*/\-rpath\%/g;
$link =~ s/\-rpath\s+/\-rpath\%/g;
$link =~ s/\-rpath-link\s+/\-rpath-link\%/g;
@flags = split(/[\s\,\n]/, $link);
flags=`echo "$link" | tr "',\n" " "`
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
#@flags = map {s/^['"]|['"]$//g; $_} @flags;
foreach $flags (@flags) {
if (
($flags =~ /^\-L/)
&& ($flags !~ /^-LIST:/)
&& ($flags !~ /^-LANG:/)
) {
$linker_L .= $flags . " ";
}
for flag in $flags; do
case "$flag" in -L*)
case "$flag" in
-LIST:*|-LANG:*) ;;
*) linker_L="$linker_L $flag" ;;
esac
esac
if ($flags =~ /^\-Y/) {
next if ($hostos eq 'SunOS');
$linker_L .= "-Wl,". $flags . " ";
}
case "$flag" in -Y*)
[ "$hostos" = "SunOS" ] && continue
linker_L="$linker_L -Wl,$flag"
;;
esac
if ($flags =~ /^\--exclude-libs/) {
$linker_L .= "-Wl,". $flags . " ";
$flags="";
}
case "$flag" in --exclude-libs*)
linker_L="$linker_L -Wl,$flag"
flag=""
;;
esac
case "$flag" in -rpath%*)
flag=`echo "$flag" | sed 's/\%/\,/g'`
linker_L="$linker_L -Wl,$flag"
esac
if ($flags =~ /^\-rpath\%/) {
$flags =~ s/\%/\,/g;
$linker_L .= "-Wl,". $flags . " " ;
}
case "$flag" in -rpath-link%*)
flag=`echo "$flag" | sed 's/\%/\,/g'`
linker_L="$linker_L -Wl,$flag"
;;
esac
if ($flags =~ /^\-rpath-link\%/) {
$flags =~ s/\%/\,/g;
$linker_L .= "-Wl,". $flags . " " ;
}
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
$flags = "-lomp";
}
case "$flag" in *-lgomp*)
case "$CC" in *clang*)
flag="-lomp"
;;
esac
esac
if (
($flags =~ /^\-l/)
&& ($flags !~ /ibrary/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /flangmain/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
&& ($flags !~ /crt[0-9]/)
&& ($flags !~ /gcc/)
&& ($flags !~ /user32/)
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/))
&& ($flags !~ /[0-9]+/ || ($vendor == FUJITSU && $flags =~ /^-lfj90/))
&& ($flags !~ /^\-l$/)
) {
$linker_l .= $flags . " ";
}
case "$flag" in -l*)
case "$flag" in
*ibrary*|*gfortranbegin*|*flangmain*|*frtbegin*|*pathfstart*|\
*crt[0-9]*|*gcc*|*user32*|*kernel32*|*advapi32*|*shell32*|\
-l) ;;
*omp*)
case "$vendor" in
*PGI*|*FUJITSU*) ;;
*) linker_l="$linker_l $flag" ;;
esac
;;
*[0-9]*)
if [ "$vendor" = "FUJITSU" ]; then
case "$flag" in
-lfj90*) linker_l="$linker_l $flag" ;;
*) ;;
esac
fi
;;
*) linker_l="$linker_l $flag" ;;
esac
esac
if ( $flags =~ /quickfit.o/ && $vendor == NAG) {
$linker_l .= $flags . " ";
}
if ( $flags =~ /safefit.o/ && $vendor == NAG) {
$linker_l .= $flags . " ";
}
if ( $flags =~ /thsafe.o/ && $vendor == NAG) {
$linker_l .= $flags . " ";
}
case "$flag" in *quickfit.o*)
[ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;;
esac
$linker_a .= $flags . " " if $flags =~ /\.a$/;
}
case "$flag" in *safefit.o*)
[ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;;
esac
}
case "$flag" in *thsafe.o*)
[ "$vendor" = "NAG" ] && linker_l="$linker_l $flag" ;;
esac
if ($vendor eq "FLANG"){
$linker_a .= "-lflang"
}
case "$flag" in *.a) linker_a="$linker_a $flag" ;; esac
done
fi
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
open(CONFFILE, ">> $config" ) || die "Can't append $config";
if [ "$vendor" = "FLANG" ]; then
linker_a="$linker_a -lflang"
fi
print MAKEFILE "F_COMPILER=$vendor\n";
print MAKEFILE "FC=$compiler\n";
print MAKEFILE "BU=$bu\n" if $bu ne "";
print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1;
printf "F_COMPILER=%s\n" "$vendor" >> "$makefile"
printf "FC=%s\n" "$compiler" >> "$makefile"
[ -n "$bu" ] && printf 'BU=%s\n' "$bu" >> "$makefile"
[ "$nofortran" -eq 1 ] && printf 'NOFORTRAN=1\n' >> "$makefile"
print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne "";
print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne "";
print CONFFILE "#define NEED2UNDERSCORES\t1\n" if $need2bu ne "";
[ -n "$bu" ] && printf '#define BUNDERSCORE\t%s\n' "$bu" >> "$config"
[ -n "$bu" ] && printf '#define NEEDBUNDERSCORE\t1\n' >> "$config"
[ -n "$need2bu" ] && printf "#define NEED2UNDERSCORES\t1\n" >> "$config"
print MAKEFILE "NEED2UNDERSCORES=1\n" if $need2bu ne "";
# Make-variable form of the double-underscore marker: a plain assignment
# appended to the makefile fragment (not a "#define" in the config header).
[ -n "$need2bu" ] && printf "NEED2UNDERSCORES=1\n" >> "$makefile"
if (($linker_l ne "") || ($linker_a ne "")) {
print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n";
}
if [ -n "$linker_l" ] || [ -n "$linker_a" ]; then
printf "FEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a" >> "$makefile"
fi
close(MAKEFILE);
close(CONFFILE);

429
f_check.pl Normal file
View File

@ -0,0 +1,429 @@
#!/usr/bin/env perl
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
#
# 1. Not specified
# 1.1 Automatically detect, then check compiler
# 1.2 If no fortran compiler is detected, gfortran is default with NOFORTRAN definition
# 2. Specified
# 2.1 If path is correct, check compiler
# 2.2 If path is not correct, but still valid compiler name, force setting
# 2.2.2 Path is not correct, invalid compiler name, then gfortran is default with NOFORTRAN definition
#
# Command line: <makefile fragment> <config header> [compiler command ...].
# Both files are appended to at the end of this script.
$makefile = shift(@ARGV);
$config = shift(@ARGV);
$nofortran = 0;
# Keep the complete remaining command line as the compiler invocation and its
# first word separately as the executable name to look up.
$compiler = join(" ", @ARGV);
$compiler_bin = shift(@ARGV);
# f77 is too ambiguous
$compiler = "" if $compiler eq "f77";
@path = split(/:/, $ENV{"PATH"});
# No usable compiler given: pick the first name from the list below that exists
# as an executable in some PATH directory. The outer loop runs over names, so
# list order is the preference order.
if ($compiler eq "") {
@lists = ("gfortran", "g95", "frt", "fort", "openf90", "openf95",
"sunf77", "sunf90", "sunf95",
"xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
"pathf90", "pathf95",
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
"flang", "egfortran",
"ifort", "nagfor", "ifx", "ftn", "crayftn");
OUTER:
foreach $lists (@lists) {
foreach $path (@path) {
if (-x $path . "/" . $lists) {
$compiler = $lists;
$compiler_bin = $lists;
# Stop at the first hit; leaves both loops.
last OUTER;
}
}
}
}
# Determine $vendor, $bu (set to "_" when symbols get a trailing underscore),
# $openmp (the vendor's OpenMP option) and $need2bu (double underscore for
# names that already contain one).
if ($compiler eq "") {
# Nothing found at all: report gfortran defaults and flag NOFORTRAN.
$nofortran = 1;
$compiler = "gfortran";
$vendor = GFORTRAN;
$bu = "_";
} else {
# Only the exit status ($?) of this lookup is used; its output is discarded.
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
$vendor = "";
if (!$?) {
# Compile ftest.f with -S and read the generated ftest.s; if that yields
# nothing, repeat and read ftest.c instead.
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
if ($data eq "") {
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
}
# A "zhoge_" symbol in the output means a trailing underscore is appended.
if ($data =~ /zhoge_/) {
$bu = "_";
}
if ($data =~ /Fujitsu/) {
$vendor = FUJITSU;
$openmp = "-Kopenmp";
} elsif ($data =~ /Cray/) {
$vendor = CRAY;
$openmp = "-fopenmp";
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
# Drop parenthesised text, then take the first x.y.z number as the version.
$data =~ s/\(+.*?\)+//g;
$data =~ /(\d+)\.(\d+).(\d+)/;
$major = $1;
$minor = $2;
if ($major >= 4) {
$vendor = GFORTRAN;
$openmp = "-fopenmp";
} else {
# GNU/GCC marker with a major version below 4: decide by compiler name.
if ($compiler =~ /flang/) {
$vendor = FLANG;
$openmp = "-fopenmp";
} elsif ($compiler =~ /ifx/) {
$vendor = INTEL;
$openmp = "-fopenmp";
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI;
$openmp = "-mp";
} else {
$vendor = G77;
$openmp = "";
}
}
}
# The checks below are independent ifs, so a later match overrides any vendor
# chosen above.
if ($data =~ /g95/) {
$vendor = G95;
$openmp = "";
}
if ($data =~ /Intel/) {
$vendor = INTEL;
$openmp = "-fopenmp";
}
if ($data =~ /Sun Fortran/) {
$vendor = SUN;
$openmp = "-xopenmp=parallel";
}
if ($data =~ /PathScale/) {
$vendor = PATHSCALE;
$openmp = "-openmp";
}
if ($data =~ /Open64/) {
$vendor = OPEN64;
$openmp = "-mp";
}
if ($data =~ /PGF/ || $data =~ /NVF/) {
$vendor = PGI;
$openmp = "-mp";
}
if ($data =~ /IBM XL/) {
$vendor = IBM;
$openmp = "-openmp";
}
if ($data =~ /NAG/) {
$vendor = NAG;
$openmp = "-openmp";
}
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data eq "") {
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
}
if ($data =~ / zho_ge__/) {
$need2bu = 1;
}
# For G95 the double-underscore marker is cleared unless NO_LAPACKE=1 is set
# in the environment.
if ($vendor =~ /G95/) {
if ($ENV{NO_LAPACKE} != 1) {
$need2bu = "";
}
}
}
# Executable not found, or no marker recognised in the output: guess the
# vendor from the compiler name alone. Again independent ifs, later wins.
if ($vendor eq "") {
if ($compiler =~ /g77/) {
$vendor = G77;
$bu = "_";
$openmp = "";
}
if ($compiler =~ /g95/) {
$vendor = G95;
$bu = "_";
$openmp = "";
}
if ($compiler =~ /gfortran/) {
$vendor = GFORTRAN;
$bu = "_";
$openmp = "-fopenmp";
}
if ($compiler =~ /ifort/ || $compiler =~ /ifx/) {
$vendor = INTEL;
$bu = "_";
$openmp = "-fopenmp";
}
if ($compiler =~ /pathf/) {
$vendor = PATHSCALE;
$bu = "_";
$openmp = "-mp";
}
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI;
$bu = "_";
$openmp = "-mp";
}
if ($compiler =~ /ftn/) {
$vendor = PGI;
$bu = "_";
$openmp = "-openmp";
}
if ($compiler =~ /frt/) {
$vendor = FUJITSU;
$bu = "_";
$openmp = "-openmp";
}
if ($compiler =~ /sunf77|sunf90|sunf95/) {
$vendor = SUN;
$bu = "_";
$openmp = "-xopenmp=parallel";
}
if ($compiler =~ /ppuf/) {
$vendor = IBM;
$openmp = "-openmp";
}
if ($compiler =~ /xlf/) {
$vendor = IBM;
$openmp = "-openmp";
}
if ($compiler =~ /open64/) {
$vendor = OPEN64;
$openmp = "-mp";
}
if ($compiler =~ /flang/) {
$vendor = FLANG;
$bu = "_";
$openmp = "-fopenmp";
}
if ($compiler =~ /nagfor/) {
$vendor = NAG;
$bu = "_";
$openmp = "-openmp";
}
# Unknown name as well: fall back to gfortran defaults and flag NOFORTRAN.
if ($vendor eq "") {
$nofortran = 1;
$compiler = "gfortran";
$vendor = GFORTRAN;
$bu = "_";
$openmp = "";
}
}
}
# Capture the compiler's verbose output for ftest2.f into $link so that the
# libraries and paths it passes to the linker can be extracted afterwards.
# Only the exit status ($?) of this lookup is used.
$data = `which $compiler_bin > /dev/null 2> /dev/null`;
if (!$?) {
$binary = $ENV{"BINARY"};
# The OpenMP option is only passed along when USE_OPENMP=1 is set.
$openmp = "" if $ENV{USE_OPENMP} != 1;
# Each fallback below runs only if the previous command exited non-zero ($?).
if ($binary == 32) {
$link = `$compiler $openmp -m32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
if ($?) {
$link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
# for AIX
if ($?) {
$link = `$compiler $openmp -maix32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For gfortran MIPS
if ($?) {
# Choose the n32 ABI when the predefined macros include _MIPS_ISA_MIPS64.
$mips_data = `$compiler_bin -E -dM - < /dev/null`;
if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
} else {
$link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
# Every 32-bit variant failed: clear $binary so the plain -v run below is used.
$binary = "" if ($?);
}
if ($binary == 64) {
$link = `$compiler $openmp -m64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
if ($?) {
$link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
# for AIX
if ($?) {
$link = `$compiler $openmp -maix64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For nagfor
if ($?) {
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
}
# Every 64-bit variant failed: clear $binary so the plain -v run below is used.
$binary = "" if ($?);
}
# No BINARY requested, or no bitness option worked: plain verbose run.
if ($binary eq "") {
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
# NAG and Cray: replace whatever was captured above with the output of their
# own options (-dryrun / -hnopattern).
if ( $vendor eq "NAG") {
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
}
if ( $vendor eq "CRAY") {
$link = `$compiler $openmp -hnopattern ftest2.f 2>&1 && rm -f a.out a.exe`;
}
# Classify the tokens of a compiler's verbose link output.
#
# Arguments:
#   $link   - captured output of the verbose / dry-run compiler run
#   $vendor - detected compiler vendor name (e.g. "GFORTRAN", "NAG")
#   $hostos - host OS name; on "SunOS" tokens starting with -Y are skipped
#   $cc     - C compiler name (may be undef); if it matches "clang", a token
#             containing -lgomp is replaced by -lomp
# Returns ($linker_L, $linker_l, $linker_a), each a list of space-terminated
# tokens: search-path and pass-through linker options, -l libraries (plus the
# NAG runtime objects), and tokens ending in ".a".
sub parse_link_flags {
    my ($link, $vendor, $hostos, $cc) = @_;
    $cc = "" unless defined $cc;
    my ($linker_L, $linker_l, $linker_a) = ("", "", "");

    # Glue option arguments to their option with '%' so that the split below
    # keeps an option and its value together in one token.
    $link =~ s/\-Y\sP\,/\-Y/g;
    $link =~ s/\-R\s*/\-rpath\%/g;
    $link =~ s/\-rpath\s+/\-rpath\%/g;
    $link =~ s/\-rpath-link\s+/\-rpath-link\%/g;

    my @flags = split(/[\s\,\n]/, $link);
    # remove leading and trailing quotes from each flag.
    s/^['"]|['"]$//g for @flags;

    foreach my $flag (@flags) {
        # Library search paths, except the -LIST: / -LANG: option groups.
        if ($flag =~ /^-L/ && $flag !~ /^-LIST:/ && $flag !~ /^-LANG:/) {
            $linker_L .= $flag . " ";
        }

        if ($flag =~ /^-Y/) {
            next if ($hostos eq 'SunOS');
            $linker_L .= "-Wl," . $flag . " ";
        }

        if ($flag =~ /^--exclude-libs/) {
            $linker_L .= "-Wl," . $flag . " ";
            # Consumed: keep it out of every later check.
            $flag = "";
        }

        # Turn the '%' glue back into the comma form expected after -Wl,.
        if ($flag =~ /^-rpath(?:-link)?%/) {
            $flag =~ s/\%/\,/g;
            $linker_L .= "-Wl," . $flag . " ";
        }

        if ($flag =~ /-lgomp/ && $cc =~ /clang/) {
            $flag = "-lomp";
        }

        if ($flag =~ /^-l/
            && $flag !~ /ibrary|gfortranbegin|flangmain|frtbegin|pathfstart|crt[0-9]|gcc|user32|kernel32|advapi32|shell32/
            # OpenMP runtimes are dropped for PGI and Fujitsu only.
            && ($flag !~ /omp/ || $vendor !~ /PGI|FUJITSU/)
            # Names containing digits are dropped, except Fujitsu's -lfj90*.
            # String comparison (eq): a numeric == on vendor names compares
            # 0 with 0 and is always true.
            && ($flag !~ /[0-9]/ || ($vendor eq "FUJITSU" && $flag =~ /^-lfj90/))
            && $flag !~ /^-l$/) {
            $linker_l .= $flag . " ";
        }

        # NAG runtime object files are linked like libraries, for NAG only.
        if ($vendor eq "NAG" && $flag =~ /(?:quickfit|safefit|thsafe)\.o/) {
            $linker_l .= $flag . " ";
        }

        $linker_a .= $flag . " " if $flag =~ /\.a$/;
    }

    return ($linker_L, $linker_l, $linker_a);
}

$linker_L = "";
$linker_l = "";
$linker_a = "";

if ($link ne "") {
    ($linker_L, $linker_l, $linker_a) =
        parse_link_flags($link, $vendor, $hostos, $ENV{"CC"});
}
# FLANG always gets -lflang appended to the archive list.
if ($vendor eq "FLANG"){
$linker_a .= "-lflang"
}
# Both outputs are opened in append mode; existing content is kept.
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
open(CONFFILE, ">> $config" ) || die "Can't append $config";
# Make variables describing the detected compiler.
print MAKEFILE "F_COMPILER=$vendor\n";
print MAKEFILE "FC=$compiler\n";
print MAKEFILE "BU=$bu\n" if $bu ne "";
print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1;
# Matching preprocessor definitions for the config header.
print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne "";
print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne "";
print CONFFILE "#define NEED2UNDERSCORES\t1\n" if $need2bu ne "";
print MAKEFILE "NEED2UNDERSCORES=1\n" if $need2bu ne "";
# FEXTRALIB is only written when at least one library or archive was found;
# search paths alone do not trigger it.
if (($linker_l ne "") || ($linker_a ne "")) {
print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n";
}
close(MAKEFILE);
close(CONFFILE);

162
getarch.c
View File

@ -94,14 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/sysinfo.h>
#endif
#if defined(__x86_64__) || defined(_M_X64)
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#else
#ifndef NO_AVX512
#define NO_AVX512
#endif
#endif
#endif
/* #define FORCE_P2 */
/* #define FORCE_KATMAI */
/* #define FORCE_COPPERMINE */
@ -140,9 +132,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PPC440FP2 */
/* #define FORCE_CELL */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R5 */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R5 */
/* #define FORCE_LOONGSON2K1000 */
/* #define FORCE_LOONGSONGENERIC */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
@ -977,6 +971,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
/* Forced target LOONGSON2K1000: defines FORCE plus the architecture, core and
   library name strings; ARCHCONFIG carries the L1/L2 cache and DTB parameters
   as -D definitions. */
#ifdef FORCE_LOONGSON2K1000
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON2K1000"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSON2K1000 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
#define LIBNAME "loongson2k1000"
#define CORENAME "LOONGSON2K1000"
#else
#endif
/* Forced target LOONGSONGENERIC: defines FORCE plus the architecture, core and
   library name strings; ARCHCONFIG carries the L1/L2 cache and DTB parameters
   as -D definitions. */
#ifdef FORCE_LOONGSONGENERIC
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSONGENERIC"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSONGENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
#define LIBNAME "loongsongeneric"
#define CORENAME "LOONGSONGENERIC"
#else
#endif
#ifdef FORCE_I6400
#define FORCE
#define ARCHITECTURE "MIPS"
@ -1240,7 +1262,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa53"
#define CORENAME "CORTEXA53"
#else
#endif
#ifdef FORCE_CORTEXA57
@ -1256,7 +1277,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif
#ifdef FORCE_CORTEXA72
@ -1272,7 +1292,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa72"
#define CORENAME "CORTEXA72"
#else
#endif
#ifdef FORCE_CORTEXA73
@ -1288,7 +1307,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa73"
#define CORENAME "CORTEXA73"
#else
#endif
/* Forced target CORTEXX1 (ARM64): defines FORCE plus the name strings;
   ARCHCONFIG carries the cache/DTB parameters and the VFP/NEON/ARMV8 feature
   definitions as -D options. */
#ifdef FORCE_CORTEXX1
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXX1"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXX1 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexx1"
#define CORENAME "CORTEXX1"
#endif
/* Forced target CORTEXX2 (ARM64): defines FORCE plus the name strings;
   ARCHCONFIG carries the cache/DTB parameters and the feature definitions,
   which here also include HAVE_SVE and ARMV9. */
#ifdef FORCE_CORTEXX2
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXX2"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXX2 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
#define LIBNAME "cortexx2"
#define CORENAME "CORTEXX2"
#endif
/* Forced target CORTEXA510 (ARM64): defines FORCE plus the name strings;
   ARCHCONFIG carries the cache/DTB parameters and the feature definitions,
   which here also include HAVE_SVE and ARMV9. */
#ifdef FORCE_CORTEXA510
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA510"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA510 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
#define LIBNAME "cortexa510"
#define CORENAME "CORTEXA510"
#endif
/* Forced target CORTEXA710 (ARM64): defines FORCE plus the name strings;
   ARCHCONFIG carries the cache/DTB parameters and the feature definitions,
   which here also include HAVE_SVE and ARMV9. */
#ifdef FORCE_CORTEXA710
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA710"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA710 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 -DARMV9"
#define LIBNAME "cortexa710"
#define CORENAME "CORTEXA710"
#endif
#ifdef FORCE_NEOVERSEN1
@ -1305,7 +1379,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-march=armv8.2-a -mtune=neoverse-n1"
#define LIBNAME "neoversen1"
#define CORENAME "NEOVERSEN1"
#else
#endif
#ifdef FORCE_NEOVERSEV1
@ -1322,7 +1395,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-march=armv8.4-a -mtune=neoverse-v1"
#define LIBNAME "neoversev1"
#define CORENAME "NEOVERSEV1"
#else
#endif
@ -1340,7 +1412,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-march=armv8.5-a -mtune=neoverse-n2"
#define LIBNAME "neoversen2"
#define CORENAME "NEOVERSEN2"
#else
#endif
#ifdef FORCE_CORTEXA55
@ -1356,7 +1427,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa55"
#define CORENAME "CORTEXA55"
#else
#endif
#ifdef FORCE_FALKOR
@ -1372,7 +1442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "falkor"
#define CORENAME "FALKOR"
#else
#endif
#ifdef FORCE_THUNDERX
@ -1387,7 +1456,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx"
#define CORENAME "THUNDERX"
#else
#endif
#ifdef FORCE_THUNDERX2T99
@ -1405,7 +1473,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99"
#else
#endif
#ifdef FORCE_TSV110
@ -1421,7 +1488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif
#ifdef FORCE_EMAG8180
@ -1456,7 +1522,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "thunderx3t110"
#define CORENAME "THUNDERX3T110"
#else
#endif
#ifdef FORCE_VORTEX
@ -1488,7 +1553,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8"
#define LIBNAME "a64fx"
#define CORENAME "A64FX"
#else
#endif
/* Forced target FT2000 (ARM64): defines FORCE and ARMV8 plus the name strings;
   ARCHCONFIG carries the L1 code/data and L2 cache parameters, the DTB
   parameters and the VFP/NEON/ARMV8 feature definitions as -D options.
   Each option must be followed by a space, because adjacent string literals
   are concatenated without any separator.
   NOTE(review): L2_SIZE=33554426 is not a power of two (32 MiB would be
   33554432) - confirm the intended value. */
#ifdef FORCE_FT2000
#define ARMV8
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "FT2000"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DFT2000 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=33554426 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "ft2000"
#define CORENAME "FT2000"
#endif
#ifdef FORCE_ZARCH_GENERIC
@ -1524,6 +1604,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef FORCE_C910V
#define FORCE
#define ARCHITECTURE "RISCV64"
#ifdef NO_RV64GV
#define SUBARCHITECTURE "RISCV64_GENERIC"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-DRISCV64_GENERIC " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "riscv64_generic"
#define CORENAME "RISCV64_GENERIC"
#else
#define SUBARCHITECTURE "C910V"
#define SUBDIRNAME "riscv64"
#define ARCHCONFIG "-DC910V " \
@ -1532,6 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "c910v"
#define CORENAME "C910V"
#endif
#else
#endif
@ -1632,17 +1723,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static int get_num_cores(void) {
int count;
#ifdef OS_WINDOWS
SYSTEM_INFO sysinfo;
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
int m[2], count;
int m[2];
size_t len;
#endif
#if defined(linux) || defined(__sun__)
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_CONF);
count = sysconf(_SC_NPROCESSORS_CONF);
if (count <= 0) count = 2;
return count;
#elif defined(OS_WINDOWS)
GetSystemInfo(&sysinfo);
@ -1653,13 +1747,15 @@ static int get_num_cores(void) {
m[1] = HW_NCPU;
len = sizeof(int);
sysctl(m, 2, &count, &len, NULL, 0);
if (count <= 0) count = 2;
return count;
#elif defined(AIX)
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
count = sysconf(_SC_NPROCESSORS_ONLN);
if (count <= 0) count = 2;
#else
return 2;
#endif
@ -1681,7 +1777,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
printf("CORE=%s\n", get_corename());
#endif
#endif
@ -1829,7 +1925,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif

View File

@ -531,8 +531,11 @@ $(BLASOBJS) $(BLASOBJS_P) : functable.h
$(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F)
functable.h : Makefile
ifndef USE_PERL
./create $(FUNCALLFILES) > functable.h
else
./create.pl $(FUNCALLFILES) > functable.h
endif
endif
clean ::

View File

@ -1,22 +1,22 @@
#!/usr/bin/env perl
#!/bin/sh
$count = 0;
count=0
foreach (@ARGV) {
print "#define\tinterface_", $_, "\t\t", $count, "\n";
$count ++;
}
for arg in "$@"; do
printf "#define\tinterface_%s\t\t%d\n" "$arg" "$count"
count=`expr $count + 1`
done
print "#ifdef USE_FUNCTABLE\n";
printf "#ifdef USE_FUNCTABLE\n"
print "#define MAX_PROF_TABLE ", $count, "\n";
printf "#define MAX_PROF_TABLE %d\n" "$count"
print "static char *func_table[] = {\n";
printf "static char *func_table[] = {\n"
foreach (@ARGV) {
print "\"", $_, "\",\n";
}
for arg in "$@"; do
printf "\"%s\",\n" "$arg"
done
print "};\n";
print "#endif\n";
printf "};\n"
printf "#endif\n"

22
interface/create.pl Normal file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env perl
# Writes the interface function table header to standard output.
# Each command-line argument is treated as a function name: it receives a
# numbered "interface_<name>" macro, and all names are then listed in
# func_table[], which is only compiled when USE_FUNCTABLE is defined.

my @names = @ARGV;

# One "#define interface_<name> <index>" line per argument, numbered from 0.
for my $idx (0 .. $#names) {
    printf "#define\tinterface_%s\t\t%d\n", $names[$idx], $idx;
}

# MAX_PROF_TABLE is the number of names that were given.
my $total = scalar @names;
print "#ifdef USE_FUNCTABLE\n",
      "#define MAX_PROF_TABLE $total\n",
      "static char *func_table[] = {\n";

# The table entries, in argument order, each as a quoted C string.
print qq("$_",\n) foreach @names;

print "};\n", "#endif\n";

View File

@ -678,7 +678,7 @@ endif ()
set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SBGEMM_SMALL_K_B0_TT)
set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
set(SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16")
GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16")
@ -854,49 +854,49 @@ endif ()
# Makefile.LA
if(NOT NO_LAPACK)
foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SB")
endif ()
if (NOT DEFINED ${float_char}NEG_TCOPY)
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c)
set(${float_char}NEG_TCOPY ../generic/zneg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
else ()
set(${float_char}NEG_TCOPY ../generic/neg_tcopy.c)
set(${float_char}NEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
endif ()
endif ()
if (NOT DEFINED ${float_char}LASWP_NCOPY)
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
set(${float_char}LASWP_NCOPY ../generic/zlaswp_ncopy.c)
set(${float_char}LASWP_NCOPY ../generic/zlaswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
else ()
set(${float_char}LASWP_NCOPY ../generic/laswp_ncopy.c)
set(${float_char}LASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
endif ()
endif ()
string(SUBSTRING ${float_type} 0 1 float_char)
GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}_${${float_char}GEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}_${${float_char}GEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}" "" "neg_tcopy" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}" "" "laswp_ncopy" false "" "" false ${float_type})
endforeach()
if (BUILD_COMPLEX AND NOT BUILD_SINGLE)
if (NOT DEFINED SNEG_TCOPY)
set(SNEG_TCOPY ../generic/neg_tcopy.c)
set(SNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
endif ()
if (NOT DEFINED SLASWP_NCOPY)
set(SLASWP_NCOPY ../generic/laswp_ncopy.c)
set(SLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}_${SGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}_${SGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}" "" "neg_tcopy" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "SINGLE")
endif()
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
if (NOT DEFINED DNEG_TCOPY)
set(DNEG_TCOPY ../generic/neg_tcopy.c)
set(DNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
endif ()
if (NOT DEFINED DLASWP_NCOPY)
set(DLASWP_NCOPY ../generic/laswp_ncopy.c)
set(DLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" "" false "DOUBLE")
endif()
endif()

View File

@ -0,0 +1,216 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
# Dot-product kernel selection. DDOT always uses dot.S.
DDOTKERNEL = dot.S
# SDOT: ../generic/dot.c for every C compiler except PGI; PGI builds use dot.S.
# NOTE(review): this is the opposite direction from the CDOT/ZDOT choice
# below (C source for non-PGI here, C source for PGI there) -- confirm intended.
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
# CDOT/ZDOT: zdot.S for every C compiler except PGI; PGI builds use ../arm/zdot.c.
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
# DSDOT shares dot.S with DDOT.
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
# Single-precision GEMM and TRMM kernels: SVE sources whose file names are
# built from SGEMM_UNROLL_N.
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
# INCOPY/ITCOPY use the SVE C sources.
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
# ONCOPY/OTCOPY use .S sources whose names are built from DGEMM_UNROLL_N.
# NOTE(review): DGEMM_UNROLL_N (not SGEMM_UNROLL_N) selects the file here --
# confirm this is intentional.
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
# Object file names for the four copy routines above.
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRMM and SYMM copy routines: SVE C sources.
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c

View File

@ -0,0 +1,216 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
# Dot-product kernel selection. DDOT always uses dot.S.
DDOTKERNEL = dot.S
# SDOT: ../generic/dot.c for every C compiler except PGI; PGI builds use dot.S.
# NOTE(review): this is the opposite direction from the CDOT/ZDOT choice
# below (C source for non-PGI here, C source for PGI there) -- confirm intended.
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
# CDOT/ZDOT: zdot.S for every C compiler except PGI; PGI builds use ../arm/zdot.c.
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
# DSDOT shares dot.S with DDOT.
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
# Single-precision GEMM and TRMM kernels: SVE sources whose file names are
# built from SGEMM_UNROLL_N.
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
# INCOPY/ITCOPY use the SVE C sources.
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
# ONCOPY/OTCOPY use .S sources whose names are built from DGEMM_UNROLL_N.
# NOTE(review): DGEMM_UNROLL_N (not SGEMM_UNROLL_N) selects the file here --
# confirm this is intentional.
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
# Object file names for the four copy routines above.
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRMM and SYMM copy routines: SVE C sources.
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c

View File

@ -0,0 +1 @@
# Reuse the Cortex-A57 kernel selection from the same kernel directory.
include $(KERNELDIR)/KERNEL.CORTEXA57

View File

@ -0,0 +1,216 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
STRSMKERNEL_LN = trsm_kernel_LN_sve.c
STRSMKERNEL_LT = trsm_kernel_LT_sve.c
STRSMKERNEL_RN = trsm_kernel_RN_sve.c
STRSMKERNEL_RT = trsm_kernel_RT_sve.c
DTRSMKERNEL_LN = trsm_kernel_LN_sve.c
DTRSMKERNEL_LT = trsm_kernel_LT_sve.c
DTRSMKERNEL_RN = trsm_kernel_RN_sve.c
DTRSMKERNEL_RT = trsm_kernel_RT_sve.c
TRSMCOPYLN_M = trsm_lncopy_sve.c
TRSMCOPYLT_M = trsm_ltcopy_sve.c
TRSMCOPYUN_M = trsm_uncopy_sve.c
TRSMCOPYUT_M = trsm_utcopy_sve.c
CTRSMKERNEL_LN = trsm_kernel_LN_sve.c
CTRSMKERNEL_LT = trsm_kernel_LT_sve.c
CTRSMKERNEL_RN = trsm_kernel_RN_sve.c
CTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c
ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c
ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c
ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c
ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c
ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c
ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c
ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
# Dot-product kernel selection. DDOT always uses dot.S.
DDOTKERNEL = dot.S
# SDOT: ../generic/dot.c for every C compiler except PGI; PGI builds use dot.S.
# NOTE(review): this is the opposite direction from the CDOT/ZDOT choice
# below (C source for non-PGI here, C source for PGI there) -- confirm intended.
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
# CDOT/ZDOT: zdot.S for every C compiler except PGI; PGI builds use ../arm/zdot.c.
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
# DSDOT shares dot.S with DDOT.
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
# Single-precision GEMM and TRMM kernels: SVE sources whose file names are
# built from SGEMM_UNROLL_N.
SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
# INCOPY/ITCOPY use the SVE C sources.
SGEMMINCOPY = sgemm_ncopy_sve_v1.c
SGEMMITCOPY = sgemm_tcopy_sve_v1.c
# ONCOPY/OTCOPY use .S sources whose names are built from DGEMM_UNROLL_N.
# NOTE(review): DGEMM_UNROLL_N (not SGEMM_UNROLL_N) selects the file here --
# confirm this is intentional.
SGEMMONCOPY = sgemm_ncopy_$(DGEMM_UNROLL_N).S
SGEMMOTCOPY = sgemm_tcopy_$(DGEMM_UNROLL_N).S
# Object file names for the four copy routines above.
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRMM and SYMM copy routines: SVE C sources.
STRMMUNCOPY_M = trmm_uncopy_sve_v1.c
STRMMLNCOPY_M = trmm_lncopy_sve_v1.c
STRMMUTCOPY_M = trmm_utcopy_sve_v1.c
STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
SSYMMUCOPY_M = symm_ucopy_sve.c
SSYMMLCOPY_M = symm_lcopy_sve.c
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
DGEMMITCOPY = dgemm_tcopy_sve_v1.c
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c
DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c
DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c
DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c
DSYMMUCOPY_M = symm_ucopy_sve.c
DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
CHEMMLTCOPY_M = zhemm_ltcopy_sve.c
CHEMMUTCOPY_M = zhemm_utcopy_sve.c
CSYMMUCOPY_M = zsymm_ucopy_sve.c
CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c
ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c
ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c
ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c
ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c
ZHEMMUTCOPY_M = zhemm_utcopy_sve.c
ZSYMMUCOPY_M = zsymm_ucopy_sve.c
ZSYMMLCOPY_M = zsymm_lcopy_sve.c

View File

@ -0,0 +1,3 @@
# Reuse the Cortex-A57 kernel selection from the same kernel directory.
include $(KERNELDIR)/KERNEL.CORTEXA57

View File

@ -187,3 +187,14 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# SBGEMM kernels: Neoverse N2 specific C sources. The kernel file name is
# built from SBGEMM_UNROLL_M and SBGEMM_UNROLL_N.
SBGEMM_BETA = sbgemm_beta_neoversen2.c
SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c
# INCOPY and ONCOPY share one ncopy source; ITCOPY and OTCOPY share one
# tcopy source.
SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c
SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c
SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c
SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c
# Object file names for the four copy routines above.
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)

173
kernel/arm64/KERNEL.generic Normal file
View File

@ -0,0 +1,173 @@
ifndef DSDOTKERNEL
DSDOTKERNEL = ../generic/dot.c
endif
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# CGEMM3MKERNEL and ZGEMM3MKERNEL are assigned at the end of this file.
# The x86 SSE3 assembly names that used to be listed here were overridden by
# those later assignments and never took effect for this arm64 kernel list.
#Pure C for other kernels
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

View File

@ -404,6 +404,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#else
nrm2_compute(n, x, inc_x, &ssq, &scale);
#endif
if (fabs(scale) <1.e-300) return 0.;
ssq = sqrt(ssq) * scale;
return ssq;

View File

@ -0,0 +1,83 @@
/***************************************************************************
* Copyright (c) 2022, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/
#include "common.h"
/*
 * Scale an m-by-n block of C in place by beta.
 *
 * C is walked as n runs of m contiguous elements, with consecutive runs
 * ldc elements apart. When beta equals ZERO the elements are overwritten
 * with ZERO directly instead of being multiplied, so whatever was stored
 * there before does not influence the result.
 *
 * Parameters:
 *   m, n      - extent of the block (elements per run, number of runs)
 *   beta      - scale factor applied to every element
 *   c, ldc    - start of the block and the distance between runs
 *   dummy1..5 - unused; never referenced in the body
 *
 * Returns 0 in all cases.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, IFLOAT *dummy2,
          BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, FLOAT *c,
          BLASLONG ldc) {
  BLASLONG i, j;
  BLASLONG chunk, remain;
  FLOAT *c_offset1, *c_offset;

  c_offset = c;

  /* Each run is processed as groups of 8 elements plus a 0..7 element tail. */
  chunk = m >> 3;
  remain = m & 7;

  if (beta == ZERO) {
    for (j = n; j > 0; j--) {
      c_offset1 = c_offset;
      c_offset += ldc;

      for (i = chunk; i > 0; i--) {
        *(c_offset1 + 0) = ZERO;
        *(c_offset1 + 1) = ZERO;
        *(c_offset1 + 2) = ZERO;
        *(c_offset1 + 3) = ZERO;
        *(c_offset1 + 4) = ZERO;
        *(c_offset1 + 5) = ZERO;
        *(c_offset1 + 6) = ZERO;
        *(c_offset1 + 7) = ZERO;
        c_offset1 += 8;
      }

      for (i = remain; i > 0; i--) {
        *c_offset1 = ZERO;
        c_offset1++;
      }
    }
  } else {
    for (j = n; j > 0; j--) {
      c_offset1 = c_offset;
      c_offset += ldc;

      for (i = chunk; i > 0; i--) {
        *(c_offset1 + 0) *= beta;
        *(c_offset1 + 1) *= beta;
        *(c_offset1 + 2) *= beta;
        *(c_offset1 + 3) *= beta;
        *(c_offset1 + 4) *= beta;
        *(c_offset1 + 5) *= beta;
        *(c_offset1 + 6) *= beta;
        *(c_offset1 + 7) *= beta;
        c_offset1 += 8;
      }

      for (i = remain; i > 0; i--) {
        *c_offset1 *= beta;
        c_offset1++;
      }
    }
  }
  return 0;
}

View File

@ -0,0 +1,45 @@
/***************************************************************************
* Copyright (c) 2022, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/
#include <arm_sve.h>
#include "common.h"
/*
 * The kernel body lives in sbgemm_kernel_8x4_neoversen2_impl.c and is
 * compiled twice from the same text: once with ALPHA_ONE defined and once
 * without.  The two passes are expected to provide
 * sbgemm_kernel_neoversen2_alpha_one() and sbgemm_kernel_neoversen2_alpha()
 * respectively, which are the two functions dispatched to below.
 */
#define ALPHA_ONE
#include "sbgemm_kernel_8x4_neoversen2_impl.c"
#undef ALPHA_ONE
#include "sbgemm_kernel_8x4_neoversen2_impl.c"

/*
 * SBGEMM kernel entry point.
 *
 * Forwards all arguments unchanged to the alpha == 1 specialisation when
 * alpha compares equal to 1.0f, and to the general-alpha variant otherwise.
 * Returns whatever the selected variant returns.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B,
          FLOAT *C, BLASLONG ldc) {
  if (alpha == 1.0f)
    return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc);

  return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc);
}

View File

@ -0,0 +1,665 @@
/***************************************************************************
* Copyright (c) 2022, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/
#include <arm_sve.h>
#include "common.h"
/*
 * Helper macros for the kernel body that follows.
 *
 * They expand to complete statements (each already ends in ';') and rely on
 * these names being in scope at the point of use:
 *   pg16, pg32, pg32_low, pg32_even, pg32_first   predicates
 *   off_vc                                        gather/scatter index vector
 *   svalpha (general-alpha build only), zero
 *   ptr_a<M>, ptr_b<N>, ptr_c0<N>                 current tile pointers
 *   ma<M>, mb<N>, mc<M><N>, oc<M><N>              vector registers
 *
 * This file is meant to be included more than once with ALPHA_ONE toggled.
 * The LOAD_C and STORE_C families have different bodies in the two
 * configurations, so any earlier definition is dropped first; redefining a
 * macro with a different replacement list is otherwise diagnosed by the
 * compiler.  The remaining macros are textually identical on every pass,
 * which the language permits without an #undef.
 */
#undef LOAD_C
#undef LOAD_C_LOW
#undef LOAD_C_EVEN
#undef LOAD_C_FIRST
#undef STORE_C
#undef STORE_C_LOW
#undef STORE_C_EVEN
#undef STORE_C_FIRST

#ifdef ALPHA_ONE

/*
 * alpha == 1 build: the current contents of C are gathered straight into the
 * accumulator mc<M><N>, the products are accumulated on top of it, and the
 * accumulator is scattered back unchanged.  The _LOW / _EVEN / _FIRST
 * variants differ only in the predicate used for the gather/scatter.
 */
#define LOAD_C(M, N) \
  mc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_LOW(M, N) \
  mc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_EVEN(M, N) \
  mc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_FIRST(M, N) \
  mc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc);

#define STORE_C(M, N) \
  svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_LOW(M, N) \
  svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_EVEN(M, N) \
  svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_FIRST(M, N) \
  svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#else

/*
 * General-alpha build: the accumulator mc<M><N> starts at zero and the
 * current contents of C are kept separately in oc<M><N>.  On store the two
 * are combined with svmad_z(pg, svalpha, mc, oc), i.e. svalpha * mc + oc on
 * the active lanes, before being scattered back.
 */
#define LOAD_C(M, N) \
  mc##M##N = svdup_f32(0); \
  oc##M##N = svld1_gather_index(pg32, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_LOW(M, N) \
  mc##M##N = svdup_f32(0); \
  oc##M##N = svld1_gather_index(pg32_low, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_EVEN(M, N) \
  mc##M##N = svdup_f32(0); \
  oc##M##N = svld1_gather_index(pg32_even, ptr_c0##N + 2 * M, off_vc);

#define LOAD_C_FIRST(M, N) \
  mc##M##N = svdup_f32(0); \
  oc##M##N = svld1_gather_index(pg32_first, ptr_c0##N + 2 * M, off_vc);

#define STORE_C(M, N) \
  mc##M##N = svmad_z(pg32, svalpha, mc##M##N, oc##M##N); \
  svst1_scatter_index(pg32, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_LOW(M, N) \
  mc##M##N = svmad_z(pg32_low, svalpha, mc##M##N, oc##M##N); \
  svst1_scatter_index(pg32_low, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_EVEN(M, N) \
  mc##M##N = svmad_z(pg32_even, svalpha, mc##M##N, oc##M##N); \
  svst1_scatter_index(pg32_even, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#define STORE_C_FIRST(M, N) \
  mc##M##N = svmad_z(pg32_first, svalpha, mc##M##N, oc##M##N); \
  svst1_scatter_index(pg32_first, ptr_c0##N + 2 * M, off_vc, mc##M##N);

#endif

/* Full-predicate bf16 loads of the current A / B operands. */
#define LOAD_A(M) ma##M = svld1_bf16(pg16, ptr_a##M);

#define LOAD_B(N) mb##N = svld1_bf16(pg16, ptr_b##N);

/* Accumulate the bf16 matrix product of ma<M> and mb<N> into mc<M><N>. */
#define MATMUL(M, N) mc##M##N = svbfmmla(mc##M##N, ma##M, mb##N);

/*
 * Operand construction for the last (k & 3) steps of the k loop.
 *
 * LOAD_KREST_<r> builds an eight-lane bf16 pattern made of two groups of four
 * lanes; each group takes r consecutive elements from ptr_<NAME><M> (2 * r
 * elements are read in total) and is padded with `zero`.
 * The _LOW variants read only r elements, place them in the first group and
 * fill the second group with `zero`.
 */
#define LOAD_KREST_1(NAME, M) \
  m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, \
                           *(ptr_##NAME##M + 1), zero, zero, zero);

#define LOAD_KREST_1_LOW(NAME, M) \
  m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), zero, zero, zero, zero, zero, \
                           zero, zero);

#define LOAD_KREST_2(NAME, M) \
  m##NAME##M = \
      svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, zero, \
                  *(ptr_##NAME##M + 2), *(ptr_##NAME##M + 3), zero, zero);

#define LOAD_KREST_2_LOW(NAME, M) \
  m##NAME##M = svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), zero, \
                           zero, zero, zero, zero, zero);

#define LOAD_KREST_3(NAME, M) \
  m##NAME##M = \
      svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
                  *(ptr_##NAME##M + 2), zero, *(ptr_##NAME##M + 3), \
                  *(ptr_##NAME##M + 4), *(ptr_##NAME##M + 5), zero);

#define LOAD_KREST_3_LOW(NAME, M) \
  m##NAME##M = \
      svdupq_bf16(*(ptr_##NAME##M), *(ptr_##NAME##M + 1), \
                  *(ptr_##NAME##M + 2), zero, zero, zero, zero, zero);
/*
 * SBGEMM micro-kernel body, compiled as sbgemm_kernel_neoversen2_alpha_one()
 * when ALPHA_ONE is defined and as sbgemm_kernel_neoversen2_alpha() otherwise.
 *
 * Parameters
 *   m, n, k  tile extents; the loops below walk n in steps of 4/2/1, m in
 *            steps of 8/4/2/1 and k in steps of 4 plus a remainder (k & 3).
 *   alpha    scale applied to the accumulated product in the general build
 *            (see the STORE_C* macros); not used by the ALPHA_ONE build.
 *   A, B     bf16 operands, reinterpreted as bfloat16_t.  They are consumed
 *            in sub-panels of two rows/columns each, 2 * k elements apart
 *            (k elements for a trailing single row/column).
 *   C, ldc   fp32 output and the element distance between its columns.
 *
 * C is accessed through gather/scatter with the index vector off_vc
 * = {0, ldc, 1, ldc + 1}, i.e. one vector holds a 2x2 patch ordered
 * (row0,col0) (row0,col1) (row1,col0) (row1,col1).  The predicates select
 * which of those four lanes really exist in a tail:
 *   pg32        both rows, both columns
 *   pg32_low    one row,   both columns
 *   pg32_even   both rows, one column
 *   pg32_first  one row,   one column
 * The load and the store of a tile must use the same predicate so that no
 * element outside the m x n tile is touched.
 *
 * NOTE(review): ldc is narrowed to uint32_t when building off_vc - confirm
 * callers never pass a leading dimension that does not fit.
 *
 * Always returns 0.
 */
#ifdef ALPHA_ONE
int sbgemm_kernel_neoversen2_alpha_one(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
#else
int sbgemm_kernel_neoversen2_alpha(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc)
#endif
{
  bfloat16_t *ptr_a = (bfloat16_t *)A;
  bfloat16_t *ptr_b = (bfloat16_t *)B;
  FLOAT *ptr_c = C;

  bfloat16_t *ptr_a0, *ptr_a1, *ptr_a2, *ptr_a3;
  bfloat16_t *ptr_b0, *ptr_b1;
  FLOAT *ptr_c00, *ptr_c01;

  svbfloat16_t ma0, ma1, ma2, ma3, mb0, mb1;
  svfloat32_t mc00, mc01, mc10, mc11, mc20, mc21, mc30, mc31;
#ifndef ALPHA_ONE
  /* Original C values, kept aside until STORE_C* combines them with alpha. */
  svfloat32_t oc00, oc01, oc10, oc11, oc20, oc21, oc30, oc31;
#endif
  svbool_t pg16 = svptrue_b16();
  svbool_t pg16_low = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0);
  svbool_t pg32 = svptrue_b32();
  svbool_t pg32_low = svdupq_b32(1, 1, 0, 0);
  svbool_t pg32_even = svdupq_b32(1, 0, 1, 0);
  svbool_t pg32_first = svdupq_b32(1, 0, 0, 0);
  svfloat32_t svalpha = svdup_f32(alpha);

  /* A bfloat16_t zero used as padding by the LOAD_KREST_* macros. */
  bfloat16 tmp = 0;
  bfloat16_t zero = *((bfloat16_t *)&tmp);

  BLASLONG krest = k & 3;

  // 00 01 10 11
  svuint32_t off_vc = svdupq_u32(0, (uint32_t)ldc, 1, (uint32_t)ldc + 1);

  /* ---------------- four columns of C at a time ---------------- */
  for (BLASLONG j = 0; j < n / 4; j++) {
    ptr_c00 = ptr_c;
    ptr_c01 = ptr_c + 2 * ldc;
    ptr_c += 4 * ldc;
    ptr_a = (bfloat16_t *)A;

    /* eight rows */
    for (BLASLONG i = 0; i < m / 8; i++) {
      ptr_a0 = ptr_a;
      ptr_a1 = ptr_a0 + 2 * k;
      ptr_a2 = ptr_a1 + 2 * k;
      ptr_a3 = ptr_a2 + 2 * k;
      ptr_a += 8 * k;

      ptr_b0 = ptr_b;
      ptr_b1 = ptr_b0 + 2 * k;

      LOAD_C(0, 0); LOAD_C(0, 1);
      LOAD_C(1, 0); LOAD_C(1, 1);
      LOAD_C(2, 0); LOAD_C(2, 1);
      LOAD_C(3, 0); LOAD_C(3, 1);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
        LOAD_B(0); LOAD_B(1);

        MATMUL(0, 0); MATMUL(0, 1);
        MATMUL(1, 0); MATMUL(1, 1);
        MATMUL(2, 0); MATMUL(2, 1);
        MATMUL(3, 0); MATMUL(3, 1);

        ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
        ptr_b0 += 8; ptr_b1 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
          LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
          LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
          LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
          LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
          LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
          LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
        }
        MATMUL(0, 0); MATMUL(0, 1);
        MATMUL(1, 0); MATMUL(1, 1);
        MATMUL(2, 0); MATMUL(2, 1);
        MATMUL(3, 0); MATMUL(3, 1);
      }

      STORE_C(0, 0); STORE_C(0, 1);
      STORE_C(1, 0); STORE_C(1, 1);
      STORE_C(2, 0); STORE_C(2, 1);
      STORE_C(3, 0); STORE_C(3, 1);

      ptr_c00 += 8; ptr_c01 += 8;
    }

    /* four rows */
    if (m & 4) {
      ptr_a0 = ptr_a;
      ptr_a1 = ptr_a0 + 2 * k;
      ptr_a += 4 * k;

      ptr_b0 = ptr_b;
      ptr_b1 = ptr_b0 + 2 * k;

      LOAD_C(0, 0); LOAD_C(0, 1);
      LOAD_C(1, 0); LOAD_C(1, 1);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0); LOAD_A(1);
        LOAD_B(0); LOAD_B(1);

        MATMUL(0, 0); MATMUL(0, 1);
        MATMUL(1, 0); MATMUL(1, 1);

        ptr_a0 += 8; ptr_a1 += 8;
        ptr_b0 += 8; ptr_b1 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
          LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
          LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
          LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
        }
        MATMUL(0, 0); MATMUL(0, 1);
        MATMUL(1, 0); MATMUL(1, 1);
      }

      STORE_C(0, 0); STORE_C(0, 1);
      STORE_C(1, 0); STORE_C(1, 1);

      ptr_c00 += 4; ptr_c01 += 4;
    }

    /* two rows */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * k;

      ptr_b0 = ptr_b;
      ptr_b1 = ptr_b0 + 2 * k;

      LOAD_C(0, 0); LOAD_C(0, 1);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0);
        LOAD_B(0); LOAD_B(1);

        MATMUL(0, 0); MATMUL(0, 1);

        ptr_a0 += 8;
        ptr_b0 += 8; ptr_b1 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0);
          LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0);
          LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0);
          LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
        }
        MATMUL(0, 0); MATMUL(0, 1);
      }

      STORE_C(0, 0); STORE_C(0, 1);

      ptr_c00 += 2; ptr_c01 += 2;
    }

    /* one row: only the low half of the A vector is loaded (4 elements) */
    if (m & 1) {
      ptr_a0 = ptr_a;

      ptr_b0 = ptr_b;
      ptr_b1 = ptr_b0 + 2 * k;

      LOAD_C_LOW(0, 0); LOAD_C_LOW(0, 1);

      for (BLASLONG p = 0; p < k / 4; p++) {
        ma0 = svld1_bf16(pg16_low, ptr_a0);
        LOAD_B(0); LOAD_B(1);

        MATMUL(0, 0); MATMUL(0, 1);

        ptr_a0 += 4;
        ptr_b0 += 8;
        ptr_b1 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1_LOW(a, 0);
          LOAD_KREST_1(b, 0); LOAD_KREST_1(b, 1);
        } else if (krest == 2) {
          LOAD_KREST_2_LOW(a, 0);
          LOAD_KREST_2(b, 0); LOAD_KREST_2(b, 1);
        } else if (krest == 3) {
          LOAD_KREST_3_LOW(a, 0);
          LOAD_KREST_3(b, 0); LOAD_KREST_3(b, 1);
        }
        MATMUL(0, 0); MATMUL(0, 1);
      }

      STORE_C_LOW(0, 0); STORE_C_LOW(0, 1);
    }

    ptr_b += 4 * k;
  }

  /* ---------------- two remaining columns of C ---------------- */
  if (n & 2) {
    ptr_c00 = ptr_c;
    ptr_c += 2 * ldc;
    ptr_a = (bfloat16_t *)A;

    /* eight rows */
    for (BLASLONG i = 0; i < m / 8; i++) {
      ptr_a0 = ptr_a;
      ptr_a1 = ptr_a0 + 2 * k;
      ptr_a2 = ptr_a1 + 2 * k;
      ptr_a3 = ptr_a2 + 2 * k;
      ptr_a += 8 * k;

      ptr_b0 = ptr_b;

      LOAD_C(0, 0);
      LOAD_C(1, 0);
      LOAD_C(2, 0);
      LOAD_C(3, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
        LOAD_B(0);

        MATMUL(0, 0);
        MATMUL(1, 0);
        MATMUL(2, 0);
        MATMUL(3, 0);

        ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
        ptr_b0 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
          LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
          LOAD_KREST_1(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
          LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
          LOAD_KREST_2(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
          LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
          LOAD_KREST_3(b, 0);
        }
        MATMUL(0, 0);
        MATMUL(1, 0);
        MATMUL(2, 0);
        MATMUL(3, 0);
      }

      STORE_C(0, 0);
      STORE_C(1, 0);
      STORE_C(2, 0);
      STORE_C(3, 0);

      ptr_c00 += 8;
    }

    /* four rows */
    if (m & 4) {
      ptr_a0 = ptr_a;
      ptr_a1 = ptr_a0 + 2 * k;
      ptr_a += 4 * k;

      ptr_b0 = ptr_b;

      LOAD_C(0, 0);
      LOAD_C(1, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0); LOAD_A(1);
        LOAD_B(0);

        MATMUL(0, 0);
        MATMUL(1, 0);

        ptr_a0 += 8; ptr_a1 += 8;
        ptr_b0 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
          LOAD_KREST_1(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
          LOAD_KREST_2(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
          LOAD_KREST_3(b, 0);
        }
        MATMUL(0, 0);
        MATMUL(1, 0);
      }

      STORE_C(0, 0);
      STORE_C(1, 0);

      ptr_c00 += 4;
    }

    /* two rows */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * k;

      ptr_b0 = ptr_b;

      LOAD_C(0, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0);
        LOAD_B(0);

        MATMUL(0, 0);

        ptr_a0 += 8;
        ptr_b0 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0);
          LOAD_KREST_1(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0);
          LOAD_KREST_2(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0);
          LOAD_KREST_3(b, 0);
        }
        MATMUL(0, 0);
      }

      STORE_C(0, 0);

      ptr_c00 += 2;
    }

    /* one row */
    if (m & 1) {
      ptr_a0 = ptr_a;

      ptr_b0 = ptr_b;

      /*
       * Only one row of C exists here, so gather with the same "low"
       * predicate that STORE_C_LOW uses below (and that the four-column
       * path uses for its single-row tail).  A full-predicate gather would
       * also read C[1] and C[ldc + 1], which belong to a row that is not
       * part of this tile and may lie past the end of the buffer.
       */
      LOAD_C_LOW(0, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        ma0 = svld1_bf16(pg16_low, ptr_a0);
        LOAD_B(0);

        MATMUL(0, 0);

        ptr_a0 += 4;
        ptr_b0 += 8;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1_LOW(a, 0);
          LOAD_KREST_1(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2_LOW(a, 0);
          LOAD_KREST_2(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3_LOW(a, 0);
          LOAD_KREST_3(b, 0);
        }
        MATMUL(0, 0);
      }

      STORE_C_LOW(0, 0);
    }

    ptr_b += 2 * k;
  }

  /* ---------------- one remaining column of C ---------------- */
  if (n & 1) {
    ptr_c00 = ptr_c;
    ptr_a = (bfloat16_t *) A;

    /* eight rows; only the low half of the B vector is loaded (4 elements) */
    for (BLASLONG i = 0; i < m / 8; i++) {
      ptr_a0 = ptr_a;
      ptr_a1 = ptr_a0 + 2 * k;
      ptr_a2 = ptr_a1 + 2 * k;
      ptr_a3 = ptr_a2 + 2 * k;
      ptr_a += 8 * k;

      ptr_b0 = ptr_b;

      LOAD_C_EVEN(0, 0);
      LOAD_C_EVEN(1, 0);
      LOAD_C_EVEN(2, 0);
      LOAD_C_EVEN(3, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0); LOAD_A(1); LOAD_A(2); LOAD_A(3);
        mb0 = svld1_bf16(pg16_low, ptr_b0);

        MATMUL(0, 0);
        MATMUL(1, 0);
        MATMUL(2, 0);
        MATMUL(3, 0);

        ptr_a0 += 8; ptr_a1 += 8; ptr_a2 += 8; ptr_a3 += 8;
        ptr_b0 += 4;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
          LOAD_KREST_1(a, 2); LOAD_KREST_1(a, 3);
          LOAD_KREST_1_LOW(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
          LOAD_KREST_2(a, 2); LOAD_KREST_2(a, 3);
          LOAD_KREST_2_LOW(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
          LOAD_KREST_3(a, 2); LOAD_KREST_3(a, 3);
          LOAD_KREST_3_LOW(b, 0);
        }
        MATMUL(0, 0);
        MATMUL(1, 0);
        MATMUL(2, 0);
        MATMUL(3, 0);
      }

      STORE_C_EVEN(0, 0);
      STORE_C_EVEN(1, 0);
      STORE_C_EVEN(2, 0);
      STORE_C_EVEN(3, 0);

      ptr_c00 += 8;
    }

    /* four rows */
    if (m & 4) {
      ptr_a0 = ptr_a;
      ptr_a1 = ptr_a0 + 2 * k;
      ptr_a += 4 * k;

      ptr_b0 = ptr_b;

      LOAD_C_EVEN(0, 0);
      LOAD_C_EVEN(1, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0); LOAD_A(1);
        mb0 = svld1_bf16(pg16_low, ptr_b0);

        MATMUL(0, 0);
        MATMUL(1, 0);

        ptr_a0 += 8; ptr_a1 += 8;
        ptr_b0 += 4;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0); LOAD_KREST_1(a, 1);
          LOAD_KREST_1_LOW(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0); LOAD_KREST_2(a, 1);
          LOAD_KREST_2_LOW(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0); LOAD_KREST_3(a, 1);
          LOAD_KREST_3_LOW(b, 0);
        }
        MATMUL(0, 0);
        MATMUL(1, 0);
      }

      STORE_C_EVEN(0, 0);
      STORE_C_EVEN(1, 0);

      ptr_c00 += 4;
    }

    /* two rows */
    if (m & 2) {
      ptr_a0 = ptr_a;
      ptr_a += 2 * k;

      ptr_b0 = ptr_b;

      LOAD_C_EVEN(0, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        LOAD_A(0);
        mb0 = svld1_bf16(pg16_low, ptr_b0);

        MATMUL(0, 0);

        ptr_a0 += 8;
        ptr_b0 += 4;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1(a, 0);
          LOAD_KREST_1_LOW(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2(a, 0);
          LOAD_KREST_2_LOW(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3(a, 0);
          LOAD_KREST_3_LOW(b, 0);
        }
        MATMUL(0, 0);
      }

      STORE_C_EVEN(0, 0);

      ptr_c00 += 2;
    }

    /* one row, one column: a single element of C */
    if (m & 1) {
      ptr_a0 = ptr_a;

      ptr_b0 = ptr_b;

      LOAD_C_FIRST(0, 0);

      for (BLASLONG p = 0; p < k / 4; p++) {
        ma0 = svld1_bf16(pg16_low, ptr_a0);
        mb0 = svld1_bf16(pg16_low, ptr_b0);

        MATMUL(0, 0);

        ptr_a0 += 4;
        ptr_b0 += 4;
      }

      if (krest) {
        if (krest == 1) {
          LOAD_KREST_1_LOW(a, 0);
          LOAD_KREST_1_LOW(b, 0);
        } else if (krest == 2) {
          LOAD_KREST_2_LOW(a, 0);
          LOAD_KREST_2_LOW(b, 0);
        } else if (krest == 3) {
          LOAD_KREST_3_LOW(a, 0);
          LOAD_KREST_3_LOW(b, 0);
        }
        MATMUL(0, 0);
      }

      STORE_C_FIRST(0, 0);
    }
  }

  return 0;
}

View File

@ -0,0 +1,101 @@
/***************************************************************************
* Copyright (c) 2022, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/
#include "common.h"
/*
 * Copy routine: interleave pairs of lda-separated runs of `a` into the
 * contiguous buffer `b`.
 *
 * `a` is walked as n runs of m contiguous elements, consecutive runs being
 * lda elements apart.  Runs are consumed two at a time; for every group of
 * four positions the output receives four elements of the first run followed
 * by the same four positions of the second run.  The final (m & 3) positions
 * are emitted the same way with the shorter length.  If n is odd, the last
 * run is copied on its own, m elements in order.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  const BLASLONG quads = m / 4; /* full groups of four positions */
  const BLASLONG tail = m & 3;  /* positions left after those groups */
  IFLOAT *src = a;
  IFLOAT *dst = b;
  BLASLONG pair, q, t;

  for (pair = 0; pair < n / 2; pair++) {
    IFLOAT *run0 = src;
    IFLOAT *run1 = src + lda;
    src += 2 * lda;

    for (q = 0; q < quads; q++) {
      for (t = 0; t < 4; t++) {
        dst[t] = run0[t];
      }
      for (t = 0; t < 4; t++) {
        dst[4 + t] = run1[t];
      }
      run0 += 4;
      run1 += 4;
      dst += 8;
    }

    /* Leftover positions: same interleave, `tail` elements per run. */
    for (t = 0; t < tail; t++) {
      dst[t] = run0[t];
    }
    for (t = 0; t < tail; t++) {
      dst[tail + t] = run1[t];
    }
    dst += 2 * tail;
  }

  if (n & 1) {
    /* Unpaired last run: straight copy. */
    for (q = 0; q < quads; q++) {
      for (t = 0; t < 4; t++) {
        dst[t] = src[t];
      }
      dst += 4;
      src += 4;
    }
    for (t = 0; t < tail; t++) {
      dst[t] = src[t];
    }
  }

  return 0;
}

View File

@ -0,0 +1,109 @@
/***************************************************************************
* Copyright (c) 2022, The OpenBLAS Project
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name of the OpenBLAS project nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
* *****************************************************************************/
#include "common.h"
/*
 * Copy routine: gather lda-strided elements of `a` into the contiguous
 * buffer `b`, two adjacent positions at a time.
 *
 * For each of the n / 2 pairs of adjacent positions (offsets 0 and 1 from
 * the current start, which then advances by 2), the m lines of `a` - each
 * lda elements after the previous one - are visited in groups of four.  A
 * group contributes the four elements at offset 0 (one per line) followed by
 * the four elements at offset 1.  The final (m & 3) lines are emitted the
 * same way with the shorter length.  If n is odd, the last position is
 * gathered on its own, one element per line.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
  const BLASLONG quads = m / 4; /* full groups of four lines */
  const BLASLONG tail = m & 3;  /* lines left after those groups */
  IFLOAT *src = a;
  IFLOAT *dst = b;
  BLASLONG pair, q, line, off;

  for (pair = 0; pair < n / 2; pair++) {
    IFLOAT *base = src;
    src += 2;

    for (q = 0; q < quads; q++) {
      for (off = 0; off < 2; off++) {
        for (line = 0; line < 4; line++) {
          dst[4 * off + line] = base[line * lda + off];
        }
      }
      dst += 8;
      base += 4 * lda;
    }

    /* Leftover lines: same ordering, `tail` elements per offset. */
    for (off = 0; off < 2; off++) {
      for (line = 0; line < tail; line++) {
        dst[tail * off + line] = base[line * lda + off];
      }
    }
    dst += 2 * tail;
  }

  if (n & 1) {
    /* Unpaired last position: one element from every line. */
    for (q = 0; q < quads; q++) {
      for (line = 0; line < 4; line++) {
        dst[line] = src[line * lda];
      }
      dst += 4;
      src += 4 * lda;
    }
    for (line = 0; line < tail; line++) {
      dst[line] = src[line * lda];
    }
  }

  return 0;
}

View File

@ -198,8 +198,8 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, OPENBLAS_COMPLEX_FLOAT *result)
{
FLOAT dotr = 0.0, doti = 0.0;
CREAL(*result) = 0.0;
CIMAG(*result) = 0.0;
OPENBLAS_COMPLEX_FLOAT cf = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0, 0.0);
*result = cf;
if ( n < 0 ) return;
@ -290,8 +290,8 @@ static void zdot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLON
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
);
CREAL(*result) = dotr;
CIMAG(*result) = doti;
cf=OPENBLAS_MAKE_COMPLEX_FLOAT(dotr, doti);
*result = cf;
return;
}
@ -312,9 +312,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
int nthreads;
FLOAT dummy_alpha;
#endif
OPENBLAS_COMPLEX_FLOAT zdot;
CREAL(zdot) = 0.0;
CIMAG(zdot) = 0.0;
OPENBLAS_COMPLEX_FLOAT zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
#if defined(SMP)
if (inc_x == 0 || inc_y == 0 || n <= 10000)
@ -341,8 +339,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
for (i = 0; i < nthreads; i++) {
CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
zdot = OPENBLAS_MAKE_COMPLEX_FLOAT (CREAL(zdot) + CREAL(*ptr), CIMAG(zdot) + CIMAG(*ptr));
ptr = (void *)(((char *)ptr) + sizeof(double) * 2);
}
}

View File

@ -108,10 +108,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_2.c
SGEMMITCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
ifndef DGEMMKERNEL
@ -120,10 +120,10 @@ DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
ifndef CGEMMKERNEL
@ -132,10 +132,10 @@ CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
ifndef ZGEMMKERNEL
@ -144,10 +144,10 @@ ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ = zgemm_incopy.o
ZGEMMITCOPYOBJ = zgemm_itcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
ifndef SGEMM_BETA

View File

@ -3,10 +3,10 @@ DGEMMINCOPY = dgemm_ncopy_16.S
DGEMMITCOPY = dgemm_tcopy_16.S
DGEMMONCOPY = dgemm_ncopy_4.S
DGEMMOTCOPY = dgemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c

View File

@ -11,26 +11,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c

View File

@ -53,6 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define s4 $f9
#define ALPHA $f4
#define max $f5
#define INF $f6
PROLOGUE
@ -61,6 +62,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LDINT INCX, 0(INCX)
#endif
// Init INF
addi.d TEMP, $r0, 0x7FF
slli.d TEMP, TEMP, 52
MTC INF, TEMP
MTC s1, $r0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
@ -198,7 +204,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
CMPEQ $fcc0, s1, a1
fcvt.d.s ALPHA, ALPHA
bcnez $fcc0, .L999
fdiv.d ALPHA, ALPHA, s1
CMPEQ $fcc0, INF, ALPHA
bcnez $fcc0, .L999
MOV max, s1
MOV s1, a1
MOV s2, a1

View File

@ -68,6 +68,7 @@
#define ALPHA $f16
#define max $f17
#define INF $f18
PROLOGUE
@ -86,6 +87,11 @@
move XX, X
NOP
//Init INF
lui TEMP, 0x7FF0
dsll TEMP, TEMP, 32
MTC1 TEMP, INF
LD a1, 0 * SIZE(X)
daddiu N, N, -1
@ -255,6 +261,9 @@
div.d ALPHA, ALPHA, s1
MOV max, s1
CMPEQ $fcc0, ALPHA, INF
bc1t $fcc0, .L999
MOV s1, a1
MOV s2, a1
MOV s3, a1

View File

@ -1,152 +0,0 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL 1
/*
 * copy_kernel - bulk copy from x to y in 256-byte chunks using inline
 * PowerPC vector assembly.
 *
 *   n : element count. It is decremented by 32 for every 256 bytes moved,
 *       and the tail at label "two" always stores a full 256 bytes, so n
 *       must be a positive multiple of 32.
 *       (256 bytes / 32 elements implies 8-byte elements -- presumably
 *       single-precision complex; confirm against the FLOAT definition.)
 *   x : source pointer      (advanced inside the asm, operand %2)
 *   y : destination pointer (advanced inside the asm, operand %3)
 *
 * Structure of the asm:
 *   - prologue: load the first 256 bytes of x into vs32..vs47 with eight
 *     paired loads (lxvp, 32 bytes each);
 *   - loop "one": store the chunk already held in registers while loading
 *     the next chunk from x, one 128-byte half at a time;
 *   - epilogue "two": store the last chunk held in registers.
 *
 * Stores are done with 16-byte stxv instructions. On little-endian builds
 * the two registers of each pair are written in swapped order (odd register
 * at the lower address) -- presumably because lxvp places the
 * lower-addressed half of the pair in the odd register on little-endian;
 * NOTE(review): confirm against the Power ISA description of lxvp.
 */
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__
(
// Prologue: preload the first 256 bytes of x into vs32..vs47.
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"addi %2, %2, 256 \n\t"
// n -= 32 (record form, sets cr0); if nothing is left beyond the chunk
// already loaded, skip the loop and go straight to the final stores.
"addic. %1, %1, -32 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
// Store the first 128 bytes of the current chunk (vs32..vs39) to y.
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
#else
// Little-endian: odd register of each pair goes to the lower address.
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 34, 48(%3) \n\t"
"stxv 37, 64(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
#endif
// vs32..vs39 are free again: refill them with the first half of the next
// chunk (x was already advanced past the chunk being stored).
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
// Store the second 128 bytes of the current chunk (vs40..vs47) to y.
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
"stxv 42, 176(%3) \n\t"
"stxv 45, 192(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
// Refill vs40..vs47 with the second half of the next chunk.
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
// Advance both pointers by one chunk and loop while elements remain.
"addi %3, %3, 256 \n\t"
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt one%= \n"
"two%=: \n\t"
// Epilogue: store the last 256-byte chunk still held in vs32..vs47.
// y is not advanced afterwards; the pointer updates are not used by the
// C code following the asm.
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 34, 48(%3) \n\t"
"stxv 37, 64(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
"stxv 42, 176(%3) \n\t"
"stxv 45, 192(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
// Assembler-level comment line; it also references the memory operands %0
// and %4, which no instruction above uses directly.
"#n=%1 x=%4=%2 y=%0=%3"
:
// Outputs. %0 declares *y as written memory; %1..%3 are read-write because
// the asm updates the counter and both pointers in place.
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
// Input. %4 declares *x as memory read by the asm.
"m" (*x)
:
// Clobbers: cr0 is written by the record-form addic.; vs32..vs47 hold the
// data in flight.
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
);
}

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "ccopy_microk_power10.c"
#include "copy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL
@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1 ))
{
BLASLONG n1 = n & -32;
BLASLONG n1 = n & -64;
if ( n1 > 0 )
{
copy_kernel(n1, x, y);

View File

@ -29,6 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(POWER10)
#pragma GCC optimize "O1"
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8

View File

@ -61,37 +61,97 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
".align 5 \n"
"one%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"stxvp 34, 32(%3) \n\t"
"stxvp 36, 64(%3) \n\t"
"stxvp 38, 96(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 34, 48(%3) \n\t"
"stxv 37, 64(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
#endif
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"stxvp 40, 128(%3) \n\t"
"stxvp 42, 160(%3) \n\t"
"stxvp 44, 192(%3) \n\t"
"stxvp 46, 224(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
#else
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
"stxv 42, 176(%3) \n\t"
"stxv 45, 192(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
#endif
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"stxvp 48, 256(%3) \n\t"
"stxvp 50, 288(%3) \n\t"
"stxvp 52, 320(%3) \n\t"
"stxvp 54, 352(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 256(%3) \n\t"
"stxv 49, 272(%3) \n\t"
"stxv 50, 288(%3) \n\t"
"stxv 51, 304(%3) \n\t"
"stxv 52, 320(%3) \n\t"
"stxv 53, 336(%3) \n\t"
"stxv 54, 352(%3) \n\t"
"stxv 55, 368(%3) \n\t"
#else
"stxv 49, 256(%3) \n\t"
"stxv 48, 272(%3) \n\t"
"stxv 51, 288(%3) \n\t"
"stxv 50, 304(%3) \n\t"
"stxv 53, 320(%3) \n\t"
"stxv 52, 336(%3) \n\t"
"stxv 55, 352(%3) \n\t"
"stxv 54, 368(%3) \n\t"
#endif
"lxvp 48, 256(%2) \n\t"
"lxvp 50, 288(%2) \n\t"
"lxvp 52, 320(%2) \n\t"
"lxvp 54, 352(%2) \n\t"
"stxvp 56, 384(%3) \n\t"
"stxvp 58, 416(%3) \n\t"
"stxvp 60, 448(%3) \n\t"
"stxvp 62, 480(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 56, 384(%3) \n\t"
"stxv 57, 400(%3) \n\t"
"stxv 58, 416(%3) \n\t"
"stxv 59, 432(%3) \n\t"
"stxv 60, 448(%3) \n\t"
"stxv 61, 464(%3) \n\t"
"stxv 62, 480(%3) \n\t"
"stxv 63, 496(%3) \n\t"
#else
"stxv 57, 384(%3) \n\t"
"stxv 56, 400(%3) \n\t"
"stxv 59, 416(%3) \n\t"
"stxv 58, 432(%3) \n\t"
"stxv 61, 448(%3) \n\t"
"stxv 60, 464(%3) \n\t"
"stxv 63, 480(%3) \n\t"
"stxv 62, 496(%3) \n\t"
#endif
"lxvp 56, 384(%2) \n\t"
"lxvp 58, 416(%2) \n\t"
"lxvp 60, 448(%2) \n\t"
@ -111,22 +171,73 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"two%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"stxvp 34, 32(%3) \n\t"
"stxvp 36, 64(%3) \n\t"
"stxvp 38, 96(%3) \n\t"
"stxvp 40, 128(%3) \n\t"
"stxvp 42, 160(%3) \n\t"
"stxvp 44, 192(%3) \n\t"
"stxvp 46, 224(%3) \n\t"
"stxvp 48, 256(%3) \n\t"
"stxvp 50, 288(%3) \n\t"
"stxvp 52, 320(%3) \n\t"
"stxvp 54, 352(%3) \n\t"
"stxvp 56, 384(%3) \n\t"
"stxvp 58, 416(%3) \n\t"
"stxvp 60, 448(%3) \n\t"
"stxvp 62, 480(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 32, 0(%3) \n\t"
"stxv 33, 16(%3) \n\t"
"stxv 34, 32(%3) \n\t"
"stxv 35, 48(%3) \n\t"
"stxv 36, 64(%3) \n\t"
"stxv 37, 80(%3) \n\t"
"stxv 38, 96(%3) \n\t"
"stxv 39, 112(%3) \n\t"
"stxv 40, 128(%3) \n\t"
"stxv 41, 144(%3) \n\t"
"stxv 42, 160(%3) \n\t"
"stxv 43, 176(%3) \n\t"
"stxv 44, 192(%3) \n\t"
"stxv 45, 208(%3) \n\t"
"stxv 46, 224(%3) \n\t"
"stxv 47, 240(%3) \n\t"
"stxv 48, 256(%3) \n\t"
"stxv 49, 272(%3) \n\t"
"stxv 50, 288(%3) \n\t"
"stxv 51, 304(%3) \n\t"
"stxv 52, 320(%3) \n\t"
"stxv 53, 336(%3) \n\t"
"stxv 54, 352(%3) \n\t"
"stxv 55, 368(%3) \n\t"
"stxv 56, 384(%3) \n\t"
"stxv 57, 400(%3) \n\t"
"stxv 58, 416(%3) \n\t"
"stxv 59, 432(%3) \n\t"
"stxv 60, 448(%3) \n\t"
"stxv 61, 464(%3) \n\t"
"stxv 62, 480(%3) \n\t"
"stxv 63, 496(%3) \n\t"
#else
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 34, 48(%3) \n\t"
"stxv 37, 64(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
"stxv 42, 176(%3) \n\t"
"stxv 45, 192(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
"stxv 49, 256(%3) \n\t"
"stxv 48, 272(%3) \n\t"
"stxv 51, 288(%3) \n\t"
"stxv 50, 304(%3) \n\t"
"stxv 53, 320(%3) \n\t"
"stxv 52, 336(%3) \n\t"
"stxv 55, 352(%3) \n\t"
"stxv 54, 368(%3) \n\t"
"stxv 57, 384(%3) \n\t"
"stxv 56, 400(%3) \n\t"
"stxv 59, 416(%3) \n\t"
"stxv 58, 432(%3) \n\t"
"stxv 61, 448(%3) \n\t"
"stxv 60, 464(%3) \n\t"
"stxv 63, 480(%3) \n\t"
"stxv 62, 496(%3) \n\t"
#endif
"#n=%1 x=%4=%2 y=%0=%3"
:

View File

@ -95,18 +95,38 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
"xvaddsp 50, 50, 36 \n\t"
"xvaddsp 51, 51, 37 \n\t"
"stxvp 48, 0(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%2) \n\t"
"stxv 49, 16(%2) \n\t"
#else
"stxv 49, 0(%2) \n\t"
"stxv 48, 16(%2) \n\t"
#endif
"xvaddsp 52, 52, 38 \n\t"
"xvaddsp 53, 53, 39 \n\t"
"stxvp 50, 32(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 50, 32(%2) \n\t"
"stxv 51, 48(%2) \n\t"
#else
"stxv 51, 32(%2) \n\t"
"stxv 50, 48(%2) \n\t"
#endif
"xvaddsp 54, 54, 56 \n\t"
"xvaddsp 55, 55, 57 \n\t"
"stxvp 52, 64(%2) \n\t"
"stxvp 54, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 52, 64(%2) \n\t"
"stxv 53, 80(%2) \n\t"
"stxv 54, 96(%2) \n\t"
"stxv 55, 112(%2) \n\t"
#else
"stxv 53, 64(%2) \n\t"
"stxv 52, 80(%2) \n\t"
"stxv 55, 96(%2) \n\t"
"stxv 54, 112(%2) \n\t"
#endif
"addi %2, %2, 128 \n\t"
@ -148,18 +168,39 @@ static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
"xvaddsp 50, 50, 36 \n\t"
"xvaddsp 51, 51, 37 \n\t"
"stxvp 48, 0(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 48, 0(%2) \n\t"
"stxv 49, 16(%2) \n\t"
#else
"stxv 49, 0(%2) \n\t"
"stxv 48, 16(%2) \n\t"
#endif
"xvaddsp 52, 52, 38 \n\t"
"xvaddsp 53, 53, 39 \n\t"
"stxvp 50, 32(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 50, 32(%2) \n\t"
"stxv 51, 48(%2) \n\t"
#else
"stxv 51, 32(%2) \n\t"
"stxv 50, 48(%2) \n\t"
#endif
"xvaddsp 54, 54, 56 \n\t"
"xvaddsp 55, 55, 57 \n\t"
"stxvp 52, 64(%2) \n\t"
"stxvp 54, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 52, 64(%2) \n\t"
"stxv 53, 80(%2) \n\t"
"stxv 54, 96(%2) \n\t"
"stxv 55, 112(%2) \n\t"
#else
"stxv 53, 64(%2) \n\t"
"stxv 52, 80(%2) \n\t"
"stxv 55, 96(%2) \n\t"
"stxv 54, 112(%2) \n\t"
#endif
"#n=%1 x=%0=%2 alpha=(%3,%4)\n"
:

View File

@ -60,14 +60,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"xvmaddadp 37, 33, %x4 \n\t"
"lxvp 32, 0(%2) \n\t"
"stxvp 36, 0(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 36, 0(%3) \n\t"
"stxv 37, 16(%3) \n\t"
#else
"stxv 37, 0(%3) \n\t"
"stxv 36, 16(%3) \n\t"
#endif
"xvmaddadp 38, 34, %x4 \n\t"
"xvmaddadp 39, 35, %x4 \n\t"
"lxvp 34, 32(%2) \n\t"
"stxvp 38, 32(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 38, 32(%3) \n\t"
"stxv 39, 48(%3) \n\t"
#else
"stxv 39, 32(%3) \n\t"
"stxv 38, 48(%3) \n\t"
#endif
"lxvp 36, 128(%3) \n\t"
"lxvp 38, 160(%3) \n\t"
@ -76,13 +87,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"xvmaddadp 45, 41, %x4 \n\t"
"lxvp 40, 64(%2) \n\t"
"stxvp 44, 64(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 44, 64(%3) \n\t"
"stxv 45, 80(%3) \n\t"
#else
"stxv 45, 64(%3) \n\t"
"stxv 44, 80(%3) \n\t"
#endif
"xvmaddadp 46, 42, %x4 \n\t"
"xvmaddadp 47, 43, %x4 \n\t"
"lxvp 42, 96(%2) \n\t"
"stxvp 46, 96(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 46, 96(%3) \n\t"
"stxv 47, 112(%3) \n\t"
#else
"stxv 47, 96(%3) \n\t"
"stxv 46, 112(%3) \n\t"
#endif
"addi %2, %2, 128 \n\t"
"addi %3, %3, 128 \n\t"
@ -105,10 +128,25 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"xvmaddadp 46, 42, %x4 \n\t"
"xvmaddadp 47, 43, %x4 \n\t"
"stxvp 36, 0(%3) \n\t"
"stxvp 38, 32(%3) \n\t"
"stxvp 44, 64(%3) \n\t"
"stxvp 46, 96(%3) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 36, 0(%3) \n\t"
"stxv 37, 16(%3) \n\t"
"stxv 38, 32(%3) \n\t"
"stxv 39, 48(%3) \n\t"
"stxv 44, 64(%3) \n\t"
"stxv 45, 80(%3) \n\t"
"stxv 46, 96(%3) \n\t"
"stxv 47, 112(%3) \n\t"
#else
"stxv 37, 0(%3) \n\t"
"stxv 36, 16(%3) \n\t"
"stxv 39, 32(%3) \n\t"
"stxv 38, 48(%3) \n\t"
"stxv 45, 64(%3) \n\t"
"stxv 44, 80(%3) \n\t"
"stxv 47, 96(%3) \n\t"
"stxv 46, 112(%3) \n\t"
#endif
"#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n"
:

View File

@ -68,7 +68,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
y[i] += da * x[i] ;
}

View File

@ -87,7 +87,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
y[i] = x[i] ;
}

View File

@ -35,327 +35,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(HAVE_KERNEL4x8_ASM)
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
#if !__has_builtin(__builtin_vsx_disassemble_pair)
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
#endif
typedef __vector unsigned char vec_t;
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
BLASLONG off2;
BLASLONG tempR;
__asm__(
"sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
"sldi %[off], %[off], 3 \n\t" // lda * sizeof (double)
"xxlxor 34,34,34 \n\t"
"xxlxor 35,34,34 \n\t"
"add %[a2], %[a0], %[temp] \n\t"
"add %[a1], %[a0], %[off] \n\t"
"xxlxor 4,34,34 \n\t"
"xxlxor 5,34,34 \n\t"
"xxlxor 6,34,34 \n\t"
"xxlxor 7,34,34 \n\t"
"add %[a3], %[a2], %[off] \n\t"
"add %[a4], %[a2], %[temp] \n\t"
"xxlxor 8,34,34 \n\t"
"xxlxor 9,34,34 \n\t"
"add %[a5], %[a3], %[temp] \n\t"
"li %[off],0 \n\t"
"li %[off2],16 \n\t"
"add %[a6], %[a4], %[temp] \n\t"
"add %[a7], %[a5], %[temp] \n\t"
"lxvp 32, 0(%[x]) \n\t"
"lxvp 36, 0(%[a0]) \n\t"
"lxvp 38, 0(%[a1]) \n\t"
"lxvp 40, 0(%[a2]) \n\t"
"lxvp 42, 0(%[a3]) \n\t"
"lxvp 44, 0(%[a4]) \n\t"
"lxvp 46, 0(%[a5]) \n\t"
"lxvp 48, 0(%[a6]) \n\t"
"lxvp 50, 0(%[a7]) \n\t"
#if defined(PREFETCH)
"li %[temp],896 \n\t"
#endif
"addic. %[n],%[n],-4 \n\t"
"li %[off],32 \n\t"
"ble- two%= \n\t"
//--------------------------------------------------
".align 5 \n\t"
"one%=: \n\t"
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvp 36, 32(%[a0]) \n\t"
"lxvp 38, 32(%[a1]) \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"addi %[off], %[off],32 \n\t"
"lxvp 40, 32(%[a2]) \n\t"
"lxvp 42, 32(%[a3]) \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvp 44, 32(%[a4]) \n\t"
"lxvp 46, 32(%[a5]) \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
"addic. %[n],%[n],-4 \n\t"
"lxvp 48, 32(%[a6]) \n\t"
"lxvp 50, 32(%[a7]) \n\t"
"lxvp 32, 32(%[x]) \n\t"
"ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvp 36, 64(%[a0]) \n\t"
"lxvp 38, 64(%[a1]) \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"addi %[off], %[off],32 \n\t"
"lxvp 40, 64(%[a2]) \n\t"
"lxvp 42, 64(%[a3]) \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvp 44, 64(%[a4]) \n\t"
"lxvp 46, 64(%[a5]) \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
"addic. %[n],%[n],-4 \n\t"
"lxvp 48, 64(%[a6]) \n\t"
"lxvp 50, 64(%[a7]) \n\t"
"lxvp 32, 64(%[x]) \n\t"
"ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
#if defined(PREFETCH)
"addi %[temp],%[temp],128 \n\t"
#endif
"addi %[off2], %[off2],32 \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a0] \n\t"
#endif
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvp 36, 96(%[a0]) \n\t"
"lxvp 38, 96(%[a1]) \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a1] \n\t"
#endif
"lxvp 40, 96(%[a2]) \n\t"
"lxvp 42, 96(%[a3]) \n\t"
"addi %[off], %[off],32 \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvp 44, 96(%[a4]) \n\t"
"lxvp 46, 96(%[a5]) \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a3] \n\t"
#endif
"lxvp 48, 96(%[a6]) \n\t"
"lxvp 50, 96(%[a7]) \n\t"
"lxvp 32, 96(%[x]) \n\t"
"addic. %[n],%[n],-4 \n\t"
"ble- two%= \n\t"
"addi %[off2], %[off2],32 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a2] \n\t"
#endif
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a4] \n\t"
#endif
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a5] \n\t"
#endif
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvp 36, 128(%[a0]) \n\t"
"lxvp 38, 128(%[a1]) \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"addi %[off], %[off],32 \n\t"
"lxvp 40, 128(%[a2]) \n\t"
"lxvp 42, 128(%[a3]) \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a6] \n\t"
#endif
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvp 44, 128(%[a4]) \n\t"
"lxvp 46, 128(%[a5]) \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a7] \n\t"
#endif
"addic. %[n],%[n],-4 \n\t"
"lxvp 48, 128(%[a6]) \n\t"
"lxvp 50, 128(%[a7]) \n\t"
"lxvp 32, 128(%[x]) \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[x] \n\t"
#endif
"addi %[a0], %[a0], 128 \n\t"
"addi %[a1], %[a1], 128 \n\t"
"addi %[a2], %[a2], 128 \n\t"
"addi %[a3], %[a3], 128 \n\t"
"addi %[a4], %[a4], 128 \n\t"
"addi %[a5], %[a5], 128 \n\t"
"addi %[a6], %[a6], 128 \n\t"
"addi %[a7], %[a7], 128 \n\t"
"addi %[x], %[x], 128 \n\t"
"bgt+ one%= \n\t"
".align 5 \n\t"
"two%=: \n\t"
//--------------------------------------------
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
XXSPLTD_S(36,%x[alpha],0)
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
"lxvp 38, 0(%[y]) \n\t"
"lxvp 40, 32(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(42,34,35)
XXMRGLD_S(43,34,35)
XXMRGHD_S(44,4,5)
XXMRGLD_S(45,4,5)
#else
XXMRGLD_S(42,35,34)
XXMRGHD_S(43,35,34)
XXMRGLD_S(44,5,4)
XXMRGHD_S(45,5,4)
#endif
"xvadddp 42,42,43 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(46,6,7)
XXMRGLD_S(47,6,7)
#else
XXMRGLD_S(46,7,6)
XXMRGHD_S(47,7,6)
#endif
"xvadddp 44,44,45 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
XXMRGHD_S(48,8,9)
XXMRGLD_S(49,8,9)
#else
XXMRGLD_S(48,9,8)
XXMRGHD_S(49,9,8)
#endif
"xvadddp 46,46,47 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 38,42,36 \n\t"
"xvmaddadp 39,44,36 \n\t"
#else
"xvmaddadp 39,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t"
#endif
"xvadddp 48,48,49 \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 41,48,36 \n\t"
#else
"xvmaddadp 41,46,36 \n\t"
#endif
"stxvp 38, 0(%[y]) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"xvmaddadp 40,46,36 \n\t"
#else
"xvmaddadp 40,48,36 \n\t"
#endif
"stxvp 40, 32(%[y]) \n\t"
: [memy] "+m" (*(double (*)[8])y),
[n] "+&r" (n),
[a0] "=b" (a0),
[a1] "=&b" (a1),
[a2] "=&b" (a2),
[a3] "=&b" (a3),
[a4] "=&b" (a4),
[a5] "=&b" (a5),
[a6] "=&b" (a6),
[a7] "=&b" (a7),
[off] "+&b" (lda),
[off2]"=&b" (off2),
[temp] "=&b" (tempR)
: [memx] "m" (*(const double (*)[n])x),
[mem_ap] "m" (*(const double (*)[n*8]) ap),
[alpha] "d" (alpha),
"[a0]" (ap),
[x] "b" (x),
[y] "b" (y)
: "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39",
"vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
);
return;
__vector_pair vx, vp;
vec_t res[2],res1[2];
register __vector double temp0 = {0, 0};
register __vector double temp1 = {0, 0};
register __vector double temp2 = {0, 0};
register __vector double temp3 = {0, 0};
register __vector double temp4 = {0, 0};
register __vector double temp5 = {0, 0};
register __vector double temp6 = {0, 0};
register __vector double temp7 = {0, 0};
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
a4 = a3 + lda;
a5 = a4 + lda;
a6 = a5 + lda;
a7 = a6 + lda;
for (i = 0; i < n/2; i += 2) {
vp = *((__vector_pair *)((void *)&a0[i*2]));
vx = *((__vector_pair *)((void *)&x[i*2]));
__builtin_vsx_disassemble_pair (res, &vx);
__builtin_vsx_disassemble_pair (res1, &vp);
temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0);
temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0);
vp = *((__vector_pair *)((void *)&a1[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1);
temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1);
vp = *((__vector_pair *)((void *)&a2[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2);
temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2);
vp = *((__vector_pair *)((void *)&a3[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3);
temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3);
vp = *((__vector_pair *)((void *)&a4[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4);
temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4);
vp = *((__vector_pair *)((void *)&a5[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5);
temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5);
vp = *((__vector_pair *)((void *)&a6[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6);
temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6);
vp = *((__vector_pair *)((void *)&a7[i*2]));
__builtin_vsx_disassemble_pair (res1, &vp);
temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7);
temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7);
}
y[0] += alpha * (temp0[0] + temp0[1]);
y[1] += alpha * (temp1[0] + temp1[1]);
y[2] += alpha * (temp2[0] + temp2[1]);
y[3] += alpha * (temp3[0] + temp3[1]);
y[4] += alpha * (temp4[0] + temp4[1]);
y[5] += alpha * (temp5[0] + temp5[1]);
y[6] += alpha * (temp6[0] + temp6[1]);
y[7] += alpha * (temp7[0] + temp7[1]);
}
#else
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {

View File

@ -59,10 +59,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"lxvp 36, 192(%2) \n\t"
"lxvp 38, 224(%2) \n\t"
"stxvp 40, 0(%2) \n\t"
"stxvp 42, 32(%2) \n\t"
"stxvp 44, 64(%2) \n\t"
"stxvp 46, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 0(%2) \n\t"
"stxv 41, 16(%2) \n\t"
"stxv 42, 32(%2) \n\t"
"stxv 43, 48(%2) \n\t"
"stxv 44, 64(%2) \n\t"
"stxv 45, 80(%2) \n\t"
"stxv 46, 96(%2) \n\t"
"stxv 47, 112(%2) \n\t"
#else
"stxv 41, 0(%2) \n\t"
"stxv 40, 16(%2) \n\t"
"stxv 43, 32(%2) \n\t"
"stxv 42, 48(%2) \n\t"
"stxv 45, 64(%2) \n\t"
"stxv 44, 80(%2) \n\t"
"stxv 47, 96(%2) \n\t"
"stxv 46, 112(%2) \n\t"
#endif
"addi %2, %2, 128 \n\t"
@ -81,10 +96,25 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"xvmuldp 46, 38, 48 \n\t"
"xvmuldp 47, 39, 48 \n\t"
"stxvp 40, 0(%2) \n\t"
"stxvp 42, 32(%2) \n\t"
"stxvp 44, 64(%2) \n\t"
"stxvp 46, 96(%2) \n\t"
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
"stxv 40, 0(%2) \n\t"
"stxv 41, 16(%2) \n\t"
"stxv 42, 32(%2) \n\t"
"stxv 43, 48(%2) \n\t"
"stxv 44, 64(%2) \n\t"
"stxv 45, 80(%2) \n\t"
"stxv 46, 96(%2) \n\t"
"stxv 47, 112(%2) \n\t"
#else
"stxv 41, 0(%2) \n\t"
"stxv 40, 16(%2) \n\t"
"stxv 43, 32(%2) \n\t"
"stxv 42, 48(%2) \n\t"
"stxv 45, 64(%2) \n\t"
"stxv 44, 80(%2) \n\t"
"stxv 47, 96(%2) \n\t"
"stxv 46, 112(%2) \n\t"
#endif
"#n=%1 alpha=%3 x=%0=%2"
:
@ -112,10 +142,14 @@ static void dscal_kernel_8_zero (long n, double *x)
".align 5 \n"
"one%=: \n\t"
"stxvp 32, 0(%2) \n\t"
"stxvp 32, 32(%2) \n\t"
"stxvp 32, 64(%2) \n\t"
"stxvp 32, 96(%2) \n\t"
"stxv 32, 0(%2) \n\t"
"stxv 32, 16(%2) \n\t"
"stxv 32, 32(%2) \n\t"
"stxv 32, 48(%2) \n\t"
"stxv 32, 64(%2) \n\t"
"stxv 32, 80(%2) \n\t"
"stxv 32, 96(%2) \n\t"
"stxv 32, 112(%2) \n\t"
"addi %2, %2, 128 \n\t"

View File

@ -120,7 +120,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
temp = y[i];
y[i] = x[i];

View File

@ -69,6 +69,7 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph
#endif
#ifdef SMP
// Multi-threading execution outperforms (or approaches) the execution of the
// small kernel.
if (num_cpu_avail(3) > 1) {
@ -77,6 +78,9 @@ int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alph
} else {
return 1;
}
#else
return 1;
#endif
#endif

View File

@ -131,6 +131,10 @@
#define alpha f27
#if defined(PPC440)
#define PREFETCHSIZE_A (3 * 4)
#endif
#if defined(PPCG4)
#define PREFETCHSIZE_A (3 * 4)
#endif

View File

@ -96,6 +96,11 @@
#define X1 r22
#if defined(PPC440)
#define PREFETCHSIZE_A 42
#define PREFETCHSIZE_C 7
#endif
#if defined(PPCG4)
#define PREFETCHSIZE_A 42
#define PREFETCHSIZE_C 7

Some files were not shown because too many files have changed in this diff Show More